From 11a8809c48351a3ea1533d4e5e86af3357ed240c Mon Sep 17 00:00:00 2001
From: Matt Lord <mattalord@gmail.com>
Date: Tue, 30 Nov 2021 20:41:17 -0500
Subject: [PATCH 1/4] Mark a replica as unhealthy when the repl lag is unknown

This prevents us from serving queries from the replica that may
be beyond the defined -unhealthy_threshold for the tablet.

Signed-off-by: Matt Lord <mattalord@gmail.com>
---
 go/mysql/flavor.go | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/go/mysql/flavor.go b/go/mysql/flavor.go
index 2902308d843..97265888bec 100644
--- a/go/mysql/flavor.go
+++ b/go/mysql/flavor.go
@@ -19,12 +19,14 @@ package mysql
 import (
 	"errors"
 	"fmt"
+	"math"
 	"strconv"
 	"strings"
 
 	"context"
 
 	"vitess.io/vitess/go/sqltypes"
+	"vitess.io/vitess/go/vt/log"
 	"vitess.io/vitess/go/vt/proto/vtrpc"
 	"vitess.io/vitess/go/vt/vterrors"
 )
@@ -315,7 +317,11 @@ func parseReplicationStatus(fields map[string]string) ReplicationStatus {
 	status.SourcePort = int(parseInt)
 	parseInt, _ = strconv.ParseInt(fields["Connect_Retry"], 10, 0)
 	status.ConnectRetry = int(parseInt)
-	parseUint, _ := strconv.ParseUint(fields["Seconds_Behind_Master"], 10, 0)
+	parseUint, err := strconv.ParseUint(fields["Seconds_Behind_Master"], 10, 0)
+	if err != nil {
+		log.Errorf("Could not compute replica lag from seconds_behind_master value of '%s', this means that replication is unhealthy; setting replica lag to max value to prevent the serving of queries", fields["Seconds_Behind_Master"])
+		parseUint = math.MaxUint32
+	}
 	status.ReplicationLagSeconds = uint(parseUint)
 	parseUint, _ = strconv.ParseUint(fields["Master_Server_Id"], 10, 0)
 	status.SourceServerID = uint(parseUint)

From ef39eb8ed4084c7aa06d3d1740745c9ac4a6b645 Mon Sep 17 00:00:00 2001
From: Matt Lord <mattalord@gmail.com>
Date: Wed, 1 Dec 2021 12:18:13 -0500
Subject: [PATCH 2/4] Add e2e test for replica health when its source is gone

Signed-off-by: Matt Lord <mattalord@gmail.com>
---
 go/test/endtoend/tabletmanager/main_test.go   | 32 +++++-----
 .../tabletmanager/tablet_health_test.go       | 58 +++++++++++++++++--
 2 files changed, 69 insertions(+), 21 deletions(-)

diff --git a/go/test/endtoend/tabletmanager/main_test.go b/go/test/endtoend/tabletmanager/main_test.go
index 08770665c49..5ac97fe4f20 100644
--- a/go/test/endtoend/tabletmanager/main_test.go
+++ b/go/test/endtoend/tabletmanager/main_test.go
@@ -35,21 +35,22 @@ import (
 )
 
 var (
-	clusterInstance     *cluster.LocalProcessCluster
-	tmClient            *tmc.Client
-	primaryTabletParams mysql.ConnParams
-	replicaTabletParams mysql.ConnParams
-	primaryTablet       cluster.Vttablet
-	replicaTablet       cluster.Vttablet
-	rdonlyTablet        cluster.Vttablet
-	hostname            = "localhost"
-	keyspaceName        = "ks"
-	shardName           = "0"
-	keyspaceShard       = "ks/" + shardName
-	dbName              = "vt_" + keyspaceName
-	username            = "vt_dba"
-	cell                = "zone1"
-	sqlSchema           = `
+	clusterInstance                  *cluster.LocalProcessCluster
+	tmClient                         *tmc.Client
+	primaryTabletParams              mysql.ConnParams
+	replicaTabletParams              mysql.ConnParams
+	primaryTablet                    cluster.Vttablet
+	replicaTablet                    cluster.Vttablet
+	rdonlyTablet                     cluster.Vttablet
+	hostname                         = "localhost"
+	keyspaceName                     = "ks"
+	shardName                        = "0"
+	keyspaceShard                    = "ks/" + shardName
+	dbName                           = "vt_" + keyspaceName
+	username                         = "vt_dba"
+	cell                             = "zone1"
+	tabletHealthcheckRefreshInterval = 5 * time.Second
+	sqlSchema                        = `
 	create table t1(
 		id bigint,
 		value varchar(16),
@@ -99,6 +100,7 @@ func TestMain(m *testing.M) {
 			"-lock_tables_timeout", "5s",
 			"-watch_replication_stream",
 			"-enable_replication_reporter",
+			"-health_check_interval", tabletHealthcheckRefreshInterval.String(),
 		}
 		// We do not need semiSync for this test case.
 		clusterInstance.EnableSemiSync = false
diff --git a/go/test/endtoend/tabletmanager/tablet_health_test.go b/go/test/endtoend/tabletmanager/tablet_health_test.go
index 81946a1edea..d7afdb44615 100644
--- a/go/test/endtoend/tabletmanager/tablet_health_test.go
+++ b/go/test/endtoend/tabletmanager/tablet_health_test.go
@@ -24,6 +24,7 @@ import (
 	"strings"
 	"sync"
 	"testing"
+	"time"
 
 	"github.com/stretchr/testify/require"
 
@@ -134,7 +135,7 @@ func TestHealthCheck(t *testing.T) {
 	// make sure the health stream is updated
 	result, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("VtTabletStreamHealth", "-count", "1", rTablet.Alias)
 	require.NoError(t, err)
-	verifyStreamHealth(t, result)
+	verifyStreamHealth(t, result, true)
 
 	// then restart replication, make sure we stay healthy
 	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", rTablet.Alias)
@@ -148,7 +149,47 @@ func TestHealthCheck(t *testing.T) {
 	require.NoError(t, err)
 	scanner := bufio.NewScanner(strings.NewReader(result))
 	for scanner.Scan() {
-		verifyStreamHealth(t, scanner.Text())
+		verifyStreamHealth(t, scanner.Text(), true)
+	}
+
+	// stop the replica's source mysqld instance to break replication
+	// and test that the tablet becomes unhealthy and non-serving
+	err = primaryTablet.MysqlctlProcess.Stop()
+	require.NoError(t, err)
+
+	time.Sleep(tabletHealthcheckRefreshInterval)
+
+	// now the replica's VtTabletStreamHealth should show it as unhealthy
+	result, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("VtTabletStreamHealth", "-count", "1", rTablet.Alias)
+	require.NoError(t, err)
+	scanner = bufio.NewScanner(strings.NewReader(result))
+	for scanner.Scan() {
+		verifyStreamHealth(t, scanner.Text(), false)
+	}
+
+	// start the primary tablet's mysqld back up
+	primaryTablet.MysqlctlProcess.InitMysql = false
+	err = primaryTablet.MysqlctlProcess.Start()
+	primaryTablet.MysqlctlProcess.InitMysql = true
+	require.NoError(t, err)
+
+	// explicitly start replication on all of the replicas to avoid any test flakiness as they were all
+	// replicating from the primary instance
+	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", rTablet.Alias)
+	require.NoError(t, err)
+	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", replicaTablet.Alias)
+	require.NoError(t, err)
+	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", rdonlyTablet.Alias)
+	require.NoError(t, err)
+
+	time.Sleep(tabletHealthcheckRefreshInterval)
+
+	// now the replica's VtTabletStreamHealth should show it as healthy again
+	result, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("VtTabletStreamHealth", "-count", "1", rTablet.Alias)
+	require.NoError(t, err)
+	scanner = bufio.NewScanner(strings.NewReader(result))
+	for scanner.Scan() {
+		verifyStreamHealth(t, scanner.Text(), true)
 	}
 
 	// Manual cleanup of processes
@@ -183,7 +224,7 @@ func checkTabletType(t *testing.T, tabletAlias string, typeWant string) {
 	assert.Equal(t, want, got)
 }
 
-func verifyStreamHealth(t *testing.T, result string) {
+func verifyStreamHealth(t *testing.T, result string, expectHealthy bool) {
 	var streamHealthResponse querypb.StreamHealthResponse
 	err := json2.Unmarshal([]byte(result), &streamHealthResponse)
 	require.NoError(t, err)
@@ -191,10 +232,15 @@ func verifyStreamHealth(t *testing.T, result string) {
 	UID := streamHealthResponse.GetTabletAlias().GetUid()
 	realTimeStats := streamHealthResponse.GetRealtimeStats()
 	replicationLagSeconds := realTimeStats.GetReplicationLagSeconds()
-	assert.True(t, serving, "Tablet should be in serving state")
 	assert.True(t, UID > 0, "Tablet should contain uid")
-	// replicationLagSeconds varies till 7200 so setting safe limit
-	assert.True(t, replicationLagSeconds < 10000, "replica should not be behind primary")
+	if expectHealthy {
+		assert.True(t, serving, "Tablet should be in serving state")
+		// replicationLagSeconds varies till 7200 so setting safe limit
+		assert.True(t, replicationLagSeconds < 10000, "replica should not be behind primary")
+	} else {
+		assert.False(t, serving, "Tablet should not be in serving state")
+		assert.False(t, replicationLagSeconds < 10000, "replica should be behind primary")
+	}
 }
 
 func TestHealthCheckDrainedStateDoesNotShutdownQueryService(t *testing.T) {

From c5143d84164ab80571b603c00716b7ecc1248bc8 Mon Sep 17 00:00:00 2001
From: Matt Lord <mattalord@gmail.com>
Date: Thu, 2 Dec 2021 15:53:34 -0500
Subject: [PATCH 3/4] Calculate replica lag ourselves when it's unknown in
 MySQL

This gives us the most accurate estimate possible -- and seconds_behind_master
is already fuzzy anyway -- of the replica lag when the replica cannot
talk to its source and is constantly retrying and in a state where
Slave_IO_Running == Connecting which we consider to be equivalent to
running (see parseReplicationStatus() in flavor.go) as reconnects happen
regularly anyway e.g. due to slave_net_timeout.

Signed-off-by: Matt Lord <mattalord@gmail.com>
---
 go/mysql/flavor.go                                | 11 ++++++-----
 go/mysql/replication_status.go                    |  4 ++++
 go/vt/vttablet/tabletserver/repltracker/poller.go |  7 ++++++-
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/go/mysql/flavor.go b/go/mysql/flavor.go
index 97265888bec..7204a36a916 100644
--- a/go/mysql/flavor.go
+++ b/go/mysql/flavor.go
@@ -19,14 +19,12 @@ package mysql
 import (
 	"errors"
 	"fmt"
-	"math"
 	"strconv"
 	"strings"
 
 	"context"
 
 	"vitess.io/vitess/go/sqltypes"
-	"vitess.io/vitess/go/vt/log"
 	"vitess.io/vitess/go/vt/proto/vtrpc"
 	"vitess.io/vitess/go/vt/vterrors"
 )
@@ -319,10 +317,13 @@ func parseReplicationStatus(fields map[string]string) ReplicationStatus {
 	status.ConnectRetry = int(parseInt)
 	parseUint, err := strconv.ParseUint(fields["Seconds_Behind_Master"], 10, 0)
 	if err != nil {
-		log.Errorf("Could not compute replica lag from seconds_behind_master value of '%s', this means that replication is unhealthy; setting replica lag to max value to prevent the serving of queries", fields["Seconds_Behind_Master"])
-		parseUint = math.MaxUint32
+		// we could not parse the value into a valid uint -- most commonly because the value is NULL from the
+		// database -- so let's reflect that the underlying value was unknown on our last check
+		status.ReplicationLagUnknown = true
+	} else {
+		status.ReplicationLagUnknown = false
+		status.ReplicationLagSeconds = uint(parseUint)
 	}
-	status.ReplicationLagSeconds = uint(parseUint)
 	parseUint, _ = strconv.ParseUint(fields["Master_Server_Id"], 10, 0)
 	status.SourceServerID = uint(parseUint)
 
diff --git a/go/mysql/replication_status.go b/go/mysql/replication_status.go
index 801d38a680f..5decc99033e 100644
--- a/go/mysql/replication_status.go
+++ b/go/mysql/replication_status.go
@@ -30,6 +30,9 @@ type ReplicationStatus struct {
 	// were to finish executing everything that's currently in its relay log.
 	// However, some MySQL flavors don't expose this information,
 	// in which case RelayLogPosition.IsZero() will be true.
+	// If ReplicationLagUnknown is true then we should not rely on the seconds
+	// behind value and we can instead try to calcuate the lag ourselves when
+	// appropriate.
 	RelayLogPosition      Position
 	FilePosition          Position
 	FileRelayLogPosition  Position
@@ -37,6 +40,7 @@ type ReplicationStatus struct {
 	IOThreadRunning       bool
 	SQLThreadRunning      bool
 	ReplicationLagSeconds uint
+	ReplicationLagUnknown bool
 	SourceHost            string
 	SourcePort            int
 	ConnectRetry          int
diff --git a/go/vt/vttablet/tabletserver/repltracker/poller.go b/go/vt/vttablet/tabletserver/repltracker/poller.go
index 995223dc28d..5a9112be82b 100644
--- a/go/vt/vttablet/tabletserver/repltracker/poller.go
+++ b/go/vt/vttablet/tabletserver/repltracker/poller.go
@@ -50,7 +50,12 @@ func (p *poller) Status() (time.Duration, error) {
 		return 0, err
 	}
 
-	if !status.ReplicationRunning() {
+	// If replication is not currently running or we don't know what the lag is -- most commonly
+	// because the replica mysqld is in the process of trying to start replicating from its source
+	// but it hasn't yet reached the point where it can calculate the seconds_behind_master
+	// value and it's thus NULL -- then we will estimate the lag ourselves using the last seen
+	// value + the time elapsed since.
+	if !status.ReplicationRunning() || status.ReplicationLagUnknown {
 		if p.timeRecorded.IsZero() {
 			return 0, vterrors.Errorf(vtrpcpb.Code_UNAVAILABLE, "replication is not running")
 		}

From 31560fdb23c6170dcc16e07748c086596a3e72fc Mon Sep 17 00:00:00 2001
From: Matt Lord <mattalord@gmail.com>
Date: Thu, 2 Dec 2021 16:08:32 -0500
Subject: [PATCH 4/4] Update tablet healthcheck test based on new replica lag
 logic

Signed-off-by: Matt Lord <mattalord@gmail.com>
---
 go/mysql/replication_status.go                       | 2 +-
 go/test/endtoend/tabletmanager/main_test.go          | 7 ++++++-
 go/test/endtoend/tabletmanager/tablet_health_test.go | 8 ++++----
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/go/mysql/replication_status.go b/go/mysql/replication_status.go
index 5decc99033e..2c818dcb537 100644
--- a/go/mysql/replication_status.go
+++ b/go/mysql/replication_status.go
@@ -31,7 +31,7 @@ type ReplicationStatus struct {
 	// However, some MySQL flavors don't expose this information,
 	// in which case RelayLogPosition.IsZero() will be true.
 	// If ReplicationLagUnknown is true then we should not rely on the seconds
-	// behind value and we can instead try to calcuate the lag ourselves when
+	// behind value and we can instead try to calculate the lag ourselves when
 	// appropriate.
 	RelayLogPosition      Position
 	FilePosition          Position
diff --git a/go/test/endtoend/tabletmanager/main_test.go b/go/test/endtoend/tabletmanager/main_test.go
index 5ac97fe4f20..9c118465edc 100644
--- a/go/test/endtoend/tabletmanager/main_test.go
+++ b/go/test/endtoend/tabletmanager/main_test.go
@@ -50,6 +50,7 @@ var (
 	username                         = "vt_dba"
 	cell                             = "zone1"
 	tabletHealthcheckRefreshInterval = 5 * time.Second
+	tabletUnhealthyThreshold         = tabletHealthcheckRefreshInterval * 2
 	sqlSchema                        = `
 	create table t1(
 		id bigint,
@@ -94,13 +95,17 @@ func TestMain(m *testing.M) {
 		}
 
 		// List of users authorized to execute vschema ddl operations
-		clusterInstance.VtGateExtraArgs = []string{"-vschema_ddl_authorized_users=%"}
+		clusterInstance.VtGateExtraArgs = []string{
+			"-vschema_ddl_authorized_users=%",
+			"-discovery_low_replication_lag", tabletUnhealthyThreshold.String(),
+		}
 		// Set extra tablet args for lock timeout
 		clusterInstance.VtTabletExtraArgs = []string{
 			"-lock_tables_timeout", "5s",
 			"-watch_replication_stream",
 			"-enable_replication_reporter",
 			"-health_check_interval", tabletHealthcheckRefreshInterval.String(),
+			"-unhealthy_threshold", tabletUnhealthyThreshold.String(),
 		}
 		// We do not need semiSync for this test case.
 		clusterInstance.EnableSemiSync = false
diff --git a/go/test/endtoend/tabletmanager/tablet_health_test.go b/go/test/endtoend/tabletmanager/tablet_health_test.go
index d7afdb44615..a27ab5a3d47 100644
--- a/go/test/endtoend/tabletmanager/tablet_health_test.go
+++ b/go/test/endtoend/tabletmanager/tablet_health_test.go
@@ -153,11 +153,12 @@ func TestHealthCheck(t *testing.T) {
 	}
 
 	// stop the replica's source mysqld instance to break replication
-	// and test that the tablet becomes unhealthy and non-serving
+	// and test that the replica tablet becomes unhealthy and non-serving after crossing
+	// the tablet's -unhealthy_threshold and the gateway's -discovery_low_replication_lag
 	err = primaryTablet.MysqlctlProcess.Stop()
 	require.NoError(t, err)
 
-	time.Sleep(tabletHealthcheckRefreshInterval)
+	time.Sleep(tabletUnhealthyThreshold + tabletHealthcheckRefreshInterval)
 
 	// now the replica's VtTabletStreamHealth should show it as unhealthy
 	result, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("VtTabletStreamHealth", "-count", "1", rTablet.Alias)
@@ -238,8 +239,7 @@ func verifyStreamHealth(t *testing.T, result string, expectHealthy bool) {
 		// replicationLagSeconds varies till 7200 so setting safe limit
 		assert.True(t, replicationLagSeconds < 10000, "replica should not be behind primary")
 	} else {
-		assert.False(t, serving, "Tablet should not be in serving state")
-		assert.False(t, replicationLagSeconds < 10000, "replica should be behind primary")
+		assert.True(t, (!serving || replicationLagSeconds >= uint32(tabletUnhealthyThreshold.Seconds())), "Tablet should not be in serving and healthy state")
 	}
 }