From 9d1d6426ef6d3ee9108df9db4ef45d270c057176 Mon Sep 17 00:00:00 2001 From: "vitess-bot[bot]" <108069721+vitess-bot[bot]@users.noreply.github.com> Date: Mon, 26 Feb 2024 13:24:14 -0500 Subject: [PATCH 1/3] Cherry-pick 24b0579d22482cef549439fb7ca56ab9c49fadb2 with conflicts --- go/test/endtoend/tabletgateway/vtgate_test.go | 75 ++++++++++++++++++- go/vt/vtgate/executor.go | 32 ++++++-- go/vt/vtgate/executor_test.go | 15 ++++ .../tabletserver/repltracker/poller.go | 4 +- .../tabletserver/repltracker/repltracker.go | 5 +- 5 files changed, 120 insertions(+), 11 deletions(-) diff --git a/go/test/endtoend/tabletgateway/vtgate_test.go b/go/test/endtoend/tabletgateway/vtgate_test.go index a3876b259f3..ffeb34f315d 100644 --- a/go/test/endtoend/tabletgateway/vtgate_test.go +++ b/go/test/endtoend/tabletgateway/vtgate_test.go @@ -28,14 +28,19 @@ import ( "testing" "time" +<<<<<<< HEAD "vitess.io/vitess/go/test/endtoend/utils" +======= +>>>>>>> 24b0579d22 (SHOW VITESS_REPLICATION_STATUS: Only use replication tracker when it's enabled (#15348)) "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "vitess.io/vitess/go/mysql" - "vitess.io/vitess/go/test/endtoend/cluster" + "vitess.io/vitess/go/test/endtoend/utils" + vtorcutils "vitess.io/vitess/go/test/endtoend/vtorc/utils" + "vitess.io/vitess/go/vt/proto/topodata" ) func TestVtgateHealthCheck(t *testing.T) { @@ -53,6 +58,57 @@ func TestVtgateHealthCheck(t *testing.T) { } func TestVtgateReplicationStatusCheck(t *testing.T) { + defer cluster.PanicHandler(t) + // Healthcheck interval on tablet is set to 1s, so sleep for 2s + time.Sleep(2 * time.Second) + verifyVtgateVariables(t, clusterInstance.VtgateProcess.VerifyURL) + ctx := context.Background() + conn, err := mysql.Connect(ctx, &vtParams) // VTGate + require.NoError(t, err) + defer conn.Close() + + // Only returns rows for REPLICA and RDONLY tablets -- so should be 2 of them + qr := utils.Exec(t, conn, "show vitess_replication_status like '%'") + expectNumRows := 2 + numRows := len(qr.Rows) + assert.Equal(t, expectNumRows, numRows, fmt.Sprintf("wrong number of results from show vitess_replication_status. Expected %d, got %d", expectNumRows, numRows)) + + // Disable VTOrc(s) recoveries so that it doesn't immediately repair/restart replication. + for _, vtorcProcess := range clusterInstance.VTOrcProcesses { + vtorcutils.DisableGlobalRecoveries(t, vtorcProcess) + } + // Re-enable recoveries afterward as the cluster is re-used. + defer func() { + for _, vtorcProcess := range clusterInstance.VTOrcProcesses { + vtorcutils.EnableGlobalRecoveries(t, vtorcProcess) + } + }() + // Stop replication on the non-PRIMARY tablets. + _, err = clusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("ExecuteFetchAsDBA", clusterInstance.Keyspaces[0].Shards[0].Replica().Alias, "stop slave") + require.NoError(t, err) + _, err = clusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("ExecuteFetchAsDBA", clusterInstance.Keyspaces[0].Shards[0].Rdonly().Alias, "stop slave") + require.NoError(t, err) + // Restart replication afterward as the cluster is re-used. + defer func() { + _, err = clusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("ExecuteFetchAsDBA", clusterInstance.Keyspaces[0].Shards[0].Replica().Alias, "start slave") + require.NoError(t, err) + _, err = clusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("ExecuteFetchAsDBA", clusterInstance.Keyspaces[0].Shards[0].Rdonly().Alias, "start slave") + require.NoError(t, err) + }() + time.Sleep(2 * time.Second) // Build up some replication lag + res, err := conn.ExecuteFetch("show vitess_replication_status", 2, false) + require.NoError(t, err) + expectNumRows = 2 + numRows = len(qr.Rows) + assert.Equal(t, expectNumRows, numRows, fmt.Sprintf("wrong number of results from show vitess_replication_status, expected %d, got %d", expectNumRows, numRows)) + rawLag := res.Named().Rows[0]["ReplicationLag"] // Let's just look at the first row + lagInt, _ := rawLag.ToInt64() // Don't check the error as the value could be "NULL" + assert.True(t, rawLag.IsNull() || lagInt > 0, "replication lag should be NULL or greater than 0 but was: %s", rawLag.ToString()) +} + +<<<<<<< HEAD +======= +func TestVtgateReplicationStatusCheckWithTabletTypeChange(t *testing.T) { defer cluster.PanicHandler(t) // Healthcheck interval on tablet is set to 1s, so sleep for 2s time.Sleep(2 * time.Second) @@ -67,8 +123,25 @@ func TestVtgateReplicationStatusCheck(t *testing.T) { expectNumRows := 2 numRows := len(qr.Rows) assert.Equal(t, expectNumRows, numRows, fmt.Sprintf("wrong number of results from show vitess_replication_status. Expected %d, got %d", expectNumRows, numRows)) + + // change the RDONLY tablet to SPARE + rdOnlyTablet := clusterInstance.Keyspaces[0].Shards[0].Rdonly() + err = clusterInstance.VtctlclientChangeTabletType(rdOnlyTablet, topodata.TabletType_SPARE) + require.NoError(t, err) + // Change it back to RDONLY afterward as the cluster is re-used. + defer func() { + err = clusterInstance.VtctlclientChangeTabletType(rdOnlyTablet, topodata.TabletType_RDONLY) + require.NoError(t, err) + }() + + // Only returns rows for REPLICA and RDONLY tablets -- so should be 1 of them since we updated 1 to spare + qr = utils.Exec(t, conn, "show vitess_replication_status like '%'") + expectNumRows = 1 + numRows = len(qr.Rows) + assert.Equal(t, expectNumRows, numRows, fmt.Sprintf("wrong number of results from show vitess_replication_status. Expected %d, got %d", expectNumRows, numRows)) } +>>>>>>> 24b0579d22 (SHOW VITESS_REPLICATION_STATUS: Only use replication tracker when it's enabled (#15348)) func verifyVtgateVariables(t *testing.T, url string) { resp, err := http.Get(url) require.NoError(t, err) diff --git a/go/vt/vtgate/executor.go b/go/vt/vtgate/executor.go index 0927ea7e391..4de224802ca 100644 --- a/go/vt/vtgate/executor.go +++ b/go/vt/vtgate/executor.go @@ -868,7 +868,7 @@ func (e *Executor) showVitessReplicationStatus(ctx context.Context, filter *sqlp tabletHostPort := ts.GetTabletHostPort() throttlerStatus, err := getTabletThrottlerStatus(tabletHostPort) if err != nil { - log.Warningf("Could not get throttler status from %s: %v", tabletHostPort, err) + log.Warningf("Could not get throttler status from %s: %v", topoproto.TabletAliasString(ts.Tablet.Alias), err) } replSourceHost := "" @@ -876,7 +876,7 @@ func (e *Executor) showVitessReplicationStatus(ctx context.Context, filter *sqlp replIOThreadHealth := "" replSQLThreadHealth := "" replLastError := "" - replLag := int64(-1) + replLag := "-1" // A string to support NULL as a value sql := "show slave status" results, err := e.txConn.tabletGateway.Execute(ctx, ts.Target, sql, nil, 0, 0, nil) if err != nil || results == nil { @@ -887,8 +887,25 @@ func (e *Executor) showVitessReplicationStatus(ctx context.Context, filter *sqlp replIOThreadHealth = row["Slave_IO_Running"].ToString() replSQLThreadHealth = row["Slave_SQL_Running"].ToString() replLastError = row["Last_Error"].ToString() - if ts.Stats != nil { - replLag = int64(ts.Stats.ReplicationLagSeconds) + // We cannot check the tablet's tabletenv config from here so + // we only use the tablet's stat -- which is managed by the + // ReplicationTracker -- if we can tell that it's enabled, + // meaning that it has a non-zero value. If it's actually + // enabled AND zero (rather than the zeroval), then mysqld + // should also return 0 so in this case the value is correct + // and equivalent either way. The only reason that we would + // want to use the ReplicationTracker based value, when we + // can, is because the polling method allows us to get the + // estimated lag value when replication is not running (based + // on how long we've seen that it's not been running). + if ts.Stats != nil && ts.Stats.ReplicationLagSeconds > 0 { // Use the value we get from the ReplicationTracker + replLag = fmt.Sprintf("%d", ts.Stats.ReplicationLagSeconds) + } else { // Use the value from mysqld + if row["Seconds_Behind_Master"].IsNull() { + replLag = strings.ToUpper(sqltypes.NullStr) // Uppercase to match mysqld's output in SHOW REPLICA STATUS + } else { + replLag = row["Seconds_Behind_Master"].ToString() + } } } replicationHealth := fmt.Sprintf("{\"EventStreamRunning\":\"%s\",\"EventApplierRunning\":\"%s\",\"LastError\":\"%s\"}", replIOThreadHealth, replSQLThreadHealth, replLastError) @@ -901,7 +918,7 @@ func (e *Executor) showVitessReplicationStatus(ctx context.Context, filter *sqlp ts.Tablet.Hostname, fmt.Sprintf("%s:%d", replSourceHost, replSourcePort), replicationHealth, - fmt.Sprintf("%d", replLag), + replLag, throttlerStatus, )) } @@ -1401,11 +1418,14 @@ func (e *Executor) checkThatPlanIsValid(stmt sqlparser.Statement, plan *engine.P return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "plan includes scatter, which is disallowed using the `no_scatter` command line argument") } +// getTabletThrottlerStatus uses HTTP to get the throttler status +// on a tablet. It uses HTTP because the CheckThrottler RPC is a +// tmclient RPC and you cannot use tmclient outside of a tablet. func getTabletThrottlerStatus(tabletHostPort string) (string, error) { client := http.Client{ Timeout: 100 * time.Millisecond, } - resp, err := client.Get(fmt.Sprintf("http://%s/throttler/check?app=vtgate", tabletHostPort)) + resp, err := client.Get(fmt.Sprintf("http://%s/throttler/check-self", tabletHostPort)) if err != nil { return "", err } diff --git a/go/vt/vtgate/executor_test.go b/go/vt/vtgate/executor_test.go index 779cba8344b..41bc148947e 100644 --- a/go/vt/vtgate/executor_test.go +++ b/go/vt/vtgate/executor_test.go @@ -47,18 +47,33 @@ import ( "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/sqltypes" "vitess.io/vitess/go/vt/callerid" +<<<<<<< HEAD "vitess.io/vitess/go/vt/sqlparser" "vitess.io/vitess/go/vt/vtgate/vindexes" "vitess.io/vitess/go/vt/vtgate/vschemaacl" +======= + "vitess.io/vitess/go/vt/discovery" + "vitess.io/vitess/go/vt/sqlparser" + "vitess.io/vitess/go/vt/topo" + "vitess.io/vitess/go/vt/vtgate/buffer" + "vitess.io/vitess/go/vt/vtgate/engine" + "vitess.io/vitess/go/vt/vtgate/logstats" + "vitess.io/vitess/go/vt/vtgate/vindexes" + "vitess.io/vitess/go/vt/vtgate/vschemaacl" + "vitess.io/vitess/go/vt/vtgate/vtgateservice" +>>>>>>> 24b0579d22 (SHOW VITESS_REPLICATION_STATUS: Only use replication tracker when it's enabled (#15348)) querypb "vitess.io/vitess/go/vt/proto/query" topodatapb "vitess.io/vitess/go/vt/proto/topodata" vschemapb "vitess.io/vitess/go/vt/proto/vschema" vtgatepb "vitess.io/vitess/go/vt/proto/vtgate" vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" +<<<<<<< HEAD "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" +======= +>>>>>>> 24b0579d22 (SHOW VITESS_REPLICATION_STATUS: Only use replication tracker when it's enabled (#15348)) ) func TestExecutorResultsExceeded(t *testing.T) { diff --git a/go/vt/vttablet/tabletserver/repltracker/poller.go b/go/vt/vttablet/tabletserver/repltracker/poller.go index ace01dffb2d..6fc964bef57 100644 --- a/go/vt/vttablet/tabletserver/repltracker/poller.go +++ b/go/vt/vttablet/tabletserver/repltracker/poller.go @@ -21,10 +21,10 @@ import ( "time" "vitess.io/vitess/go/stats" - "vitess.io/vitess/go/vt/mysqlctl" - vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" "vitess.io/vitess/go/vt/vterrors" + + vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" ) var replicationLagSeconds = stats.NewGauge("replicationLagSec", "replication lag in seconds") diff --git a/go/vt/vttablet/tabletserver/repltracker/repltracker.go b/go/vt/vttablet/tabletserver/repltracker/repltracker.go index 3d6359ed902..0be7cc9bc65 100644 --- a/go/vt/vttablet/tabletserver/repltracker/repltracker.go +++ b/go/vt/vttablet/tabletserver/repltracker/repltracker.go @@ -23,10 +23,11 @@ import ( "vitess.io/vitess/go/stats" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/mysqlctl" - querypb "vitess.io/vitess/go/vt/proto/query" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" "vitess.io/vitess/go/vt/vttablet/tabletserver/heartbeat" "vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv" + + querypb "vitess.io/vitess/go/vt/proto/query" + topodatapb "vitess.io/vitess/go/vt/proto/topodata" ) var ( From 0305048d2f540dd23def89040eb52b7835f67ffd Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Mon, 26 Feb 2024 13:56:06 -0500 Subject: [PATCH 2/3] Adjust for v17 Signed-off-by: Matt Lord --- go/test/endtoend/tabletgateway/vtgate_test.go | 42 ------------------- go/test/endtoend/vtorc/utils/utils.go | 16 +++++++ go/vt/vtgate/executor_test.go | 32 +++----------- 3 files changed, 21 insertions(+), 69 deletions(-) diff --git a/go/test/endtoend/tabletgateway/vtgate_test.go b/go/test/endtoend/tabletgateway/vtgate_test.go index ffeb34f315d..9a26888647a 100644 --- a/go/test/endtoend/tabletgateway/vtgate_test.go +++ b/go/test/endtoend/tabletgateway/vtgate_test.go @@ -28,11 +28,6 @@ import ( "testing" "time" -<<<<<<< HEAD - "vitess.io/vitess/go/test/endtoend/utils" - -======= ->>>>>>> 24b0579d22 (SHOW VITESS_REPLICATION_STATUS: Only use replication tracker when it's enabled (#15348)) "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -40,7 +35,6 @@ import ( "vitess.io/vitess/go/test/endtoend/cluster" "vitess.io/vitess/go/test/endtoend/utils" vtorcutils "vitess.io/vitess/go/test/endtoend/vtorc/utils" - "vitess.io/vitess/go/vt/proto/topodata" ) func TestVtgateHealthCheck(t *testing.T) { @@ -106,42 +100,6 @@ func TestVtgateReplicationStatusCheck(t *testing.T) { assert.True(t, rawLag.IsNull() || lagInt > 0, "replication lag should be NULL or greater than 0 but was: %s", rawLag.ToString()) } -<<<<<<< HEAD -======= -func TestVtgateReplicationStatusCheckWithTabletTypeChange(t *testing.T) { - defer cluster.PanicHandler(t) - // Healthcheck interval on tablet is set to 1s, so sleep for 2s - time.Sleep(2 * time.Second) - verifyVtgateVariables(t, clusterInstance.VtgateProcess.VerifyURL) - ctx := context.Background() - conn, err := mysql.Connect(ctx, &vtParams) - require.NoError(t, err) - defer conn.Close() - - // Only returns rows for REPLICA and RDONLY tablets -- so should be 2 of them - qr := utils.Exec(t, conn, "show vitess_replication_status like '%'") - expectNumRows := 2 - numRows := len(qr.Rows) - assert.Equal(t, expectNumRows, numRows, fmt.Sprintf("wrong number of results from show vitess_replication_status. Expected %d, got %d", expectNumRows, numRows)) - - // change the RDONLY tablet to SPARE - rdOnlyTablet := clusterInstance.Keyspaces[0].Shards[0].Rdonly() - err = clusterInstance.VtctlclientChangeTabletType(rdOnlyTablet, topodata.TabletType_SPARE) - require.NoError(t, err) - // Change it back to RDONLY afterward as the cluster is re-used. - defer func() { - err = clusterInstance.VtctlclientChangeTabletType(rdOnlyTablet, topodata.TabletType_RDONLY) - require.NoError(t, err) - }() - - // Only returns rows for REPLICA and RDONLY tablets -- so should be 1 of them since we updated 1 to spare - qr = utils.Exec(t, conn, "show vitess_replication_status like '%'") - expectNumRows = 1 - numRows = len(qr.Rows) - assert.Equal(t, expectNumRows, numRows, fmt.Sprintf("wrong number of results from show vitess_replication_status. Expected %d, got %d", expectNumRows, numRows)) -} - ->>>>>>> 24b0579d22 (SHOW VITESS_REPLICATION_STATUS: Only use replication tracker when it's enabled (#15348)) func verifyVtgateVariables(t *testing.T, url string) { resp, err := http.Get(url) require.NoError(t, err) diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index 3f228d0e041..7d22e60b410 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -1020,3 +1020,19 @@ func PrintVTOrcLogsOnFailure(t *testing.T, clusterInstance *cluster.LocalProcess log.Errorf("%s", string(content)) } } + +// EnableGlobalRecoveries enables global recoveries for the given VTOrc. +func EnableGlobalRecoveries(t *testing.T, vtorc *cluster.VTOrcProcess) { + status, resp, err := MakeAPICall(t, vtorc, "/api/enable-global-recoveries") + require.NoError(t, err) + assert.Equal(t, 200, status) + assert.Equal(t, "Global recoveries enabled\n", resp) +} + +// DisableGlobalRecoveries disables global recoveries for the given VTOrc. +func DisableGlobalRecoveries(t *testing.T, vtorc *cluster.VTOrcProcess) { + status, resp, err := MakeAPICall(t, vtorc, "/api/disable-global-recoveries") + require.NoError(t, err) + assert.Equal(t, 200, status) + assert.Equal(t, "Global recoveries disabled\n", resp) +} diff --git a/go/vt/vtgate/executor_test.go b/go/vt/vtgate/executor_test.go index 41bc148947e..fa7cd8ab422 100644 --- a/go/vt/vtgate/executor_test.go +++ b/go/vt/vtgate/executor_test.go @@ -28,52 +28,30 @@ import ( "strings" "testing" + "github.com/google/go-cmp/cmp" "github.com/google/safehtml/template" - - "vitess.io/vitess/go/vt/vtgate/logstats" - + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "google.golang.org/protobuf/proto" - "vitess.io/vitess/go/mysql/collations" - "vitess.io/vitess/go/cache" - "vitess.io/vitess/go/test/utils" - "vitess.io/vitess/go/vt/vtgate/engine" - - "vitess.io/vitess/go/vt/topo" - - "github.com/google/go-cmp/cmp" - "vitess.io/vitess/go/mysql" + "vitess.io/vitess/go/mysql/collations" "vitess.io/vitess/go/sqltypes" + "vitess.io/vitess/go/test/utils" "vitess.io/vitess/go/vt/callerid" -<<<<<<< HEAD - "vitess.io/vitess/go/vt/sqlparser" - "vitess.io/vitess/go/vt/vtgate/vindexes" - "vitess.io/vitess/go/vt/vtgate/vschemaacl" -======= - "vitess.io/vitess/go/vt/discovery" "vitess.io/vitess/go/vt/sqlparser" "vitess.io/vitess/go/vt/topo" - "vitess.io/vitess/go/vt/vtgate/buffer" "vitess.io/vitess/go/vt/vtgate/engine" "vitess.io/vitess/go/vt/vtgate/logstats" "vitess.io/vitess/go/vt/vtgate/vindexes" "vitess.io/vitess/go/vt/vtgate/vschemaacl" - "vitess.io/vitess/go/vt/vtgate/vtgateservice" ->>>>>>> 24b0579d22 (SHOW VITESS_REPLICATION_STATUS: Only use replication tracker when it's enabled (#15348)) querypb "vitess.io/vitess/go/vt/proto/query" topodatapb "vitess.io/vitess/go/vt/proto/topodata" vschemapb "vitess.io/vitess/go/vt/proto/vschema" vtgatepb "vitess.io/vitess/go/vt/proto/vtgate" vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" -<<<<<<< HEAD - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -======= ->>>>>>> 24b0579d22 (SHOW VITESS_REPLICATION_STATUS: Only use replication tracker when it's enabled (#15348)) ) func TestExecutorResultsExceeded(t *testing.T) { From 4110c2e3f97da6a0fc9907f2dbe05996d06fe30f Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Mon, 26 Feb 2024 14:01:33 -0500 Subject: [PATCH 3/3] Kick CI Signed-off-by: Matt Lord