From 121a6c395ac232070c18a864189c7dccf81c1c11 Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Tue, 27 May 2025 02:01:34 +0200 Subject: [PATCH 1/3] `vtorc`: add keyspace/shard labels to recoveries stats Signed-off-by: Tim Vaillancourt --- go/vt/vtorc/logic/topology_recovery.go | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index ace97999505..cdc0bc95642 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -56,14 +56,6 @@ const ( ) var ( - actionableRecoveriesNames = []string{ - RecoverDeadPrimaryRecoveryName, - RecoverPrimaryHasPrimaryRecoveryName, - ElectNewPrimaryRecoveryName, - FixPrimaryRecoveryName, - FixReplicaRecoveryName, - } - countPendingRecoveries = stats.NewGauge("PendingRecoveries", "Count of the number of pending recoveries") // detectedProblems is used to track the number of detected problems. @@ -80,14 +72,17 @@ var ( // shardsLockCounter is a count of in-flight shard locks. Use atomics to read/update. shardsLockCounter int64 + // recoveriesCounterLabels are labels for grouping the counter based stats for recoveries. + recoveriesCounterLabels = []string{"RecoveryType", "Keyspace", "Shard"} + // recoveriesCounter counts the number of recoveries that VTOrc has performed - recoveriesCounter = stats.NewCountersWithSingleLabel("RecoveriesCount", "Count of the different recoveries performed", "RecoveryType", actionableRecoveriesNames...) + recoveriesCounter = stats.NewCountersWithMultiLabels("RecoveriesCount", "Count of the different recoveries performed", recoveriesCounterLabels) // recoveriesSuccessfulCounter counts the number of successful recoveries that VTOrc has performed - recoveriesSuccessfulCounter = stats.NewCountersWithSingleLabel("SuccessfulRecoveries", "Count of the different successful recoveries performed", "RecoveryType", actionableRecoveriesNames...) + recoveriesSuccessfulCounter = stats.NewCountersWithMultiLabels("SuccessfulRecoveries", "Count of the different successful recoveries performed", recoveriesCounterLabels) // recoveriesFailureCounter counts the number of failed recoveries that VTOrc has performed - recoveriesFailureCounter = stats.NewCountersWithSingleLabel("FailedRecoveries", "Count of the different failed recoveries performed", "RecoveryType", actionableRecoveriesNames...) + recoveriesFailureCounter = stats.NewCountersWithMultiLabels("FailedRecoveries", "Count of the different failed recoveries performed", recoveriesCounterLabels) // vtops vtopsExec = external.NewExecVTOps() @@ -793,17 +788,18 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er return err } recoveryName := getRecoverFunctionName(checkAndRecoverFunctionCode) - recoveriesCounter.Add(recoveryName, 1) + recoveryLabels := []string{recoveryName, analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard} + recoveriesCounter.Add(recoveryLabels, 1) if err != nil { message := fmt.Sprintf("Recovery failed on %s (%s) for problem %s. Error: %s", analysisEntry.AnalyzedInstanceAlias, analysisEntry.AnalyzedInstanceHostname, analysisEntry.Analysis, err.Error()) vtopsExec.SendSlackMessage(message, vtopsSlackChannel) logger.Errorf(message) - recoveriesFailureCounter.Add(recoveryName, 1) + recoveriesFailureCounter.Add(recoveryLabels, 1) } else { message := fmt.Sprintf("Recovery succeeded on %s (%s) for problem %s.", analysisEntry.AnalyzedInstanceAlias, analysisEntry.AnalyzedInstanceHostname, analysisEntry.Analysis) logger.Info(message) vtopsExec.SendSlackMessage(message, vtopsSlackChannel) - recoveriesSuccessfulCounter.Add(recoveryName, 1) + recoveriesSuccessfulCounter.Add(recoveryLabels, 1) } if topologyRecovery == nil { logger.Error("Topology recovery is nil - recovery might have failed") From 6cfcf9f4cf611d7108da829937359a1ecd492d98 Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Tue, 27 May 2025 20:31:52 +0200 Subject: [PATCH 2/3] add keyspace/shard to `utils.WaitForSuccessfulRecoveryCount` Signed-off-by: Tim Vaillancourt --- go/test/endtoend/vtorc/general/vtorc_test.go | 34 +++++++++---------- .../primaryfailure/primary_failure_test.go | 14 ++++---- go/test/endtoend/vtorc/utils/utils.go | 5 +-- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go index 60b1c7eb3dc..1a8441b22d8 100644 --- a/go/test/endtoend/vtorc/general/vtorc_test.go +++ b/go/test/endtoend/vtorc/general/vtorc_test.go @@ -76,7 +76,7 @@ func TestSingleKeyspace(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true) utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1) } @@ -95,7 +95,7 @@ func TestKeyspaceShard(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true) utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1) } @@ -119,7 +119,7 @@ func TestVTOrcRepairs(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) var replica, otherReplica *cluster.Vttablet @@ -147,7 +147,7 @@ func TestVTOrcRepairs(t *testing.T) { // wait for repair match := utils.WaitForReadOnlyValue(t, curPrimary, 0) require.True(t, match) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) }) t.Run("ReplicaReadWrite", func(t *testing.T) { @@ -158,7 +158,7 @@ func TestVTOrcRepairs(t *testing.T) { // wait for repair match := utils.WaitForReadOnlyValue(t, replica, 1) require.True(t, match) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, keyspace.Name, shard0.Name, 1) }) t.Run("StopReplication", func(t *testing.T) { @@ -168,7 +168,7 @@ func TestVTOrcRepairs(t *testing.T) { // check replication is setup correctly utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 2) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, keyspace.Name, shard0.Name, 2) // Stop just the IO thread on the replica _, err = utils.RunSQL(t, "STOP SLAVE IO_THREAD", replica, "") @@ -176,7 +176,7 @@ func TestVTOrcRepairs(t *testing.T) { // check replication is setup correctly utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 3) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, keyspace.Name, shard0.Name, 3) // Stop just the SQL thread on the replica _, err = utils.RunSQL(t, "STOP SLAVE SQL_THREAD", replica, "") @@ -184,7 +184,7 @@ func TestVTOrcRepairs(t *testing.T) { // check replication is setup correctly utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 4) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, keyspace.Name, shard0.Name, 4) }) t.Run("ReplicationFromOtherReplica", func(t *testing.T) { @@ -200,7 +200,7 @@ func TestVTOrcRepairs(t *testing.T) { // wait until the source port is set back correctly by vtorc utils.CheckSourcePort(t, replica, curPrimary, 15*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 5) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, keyspace.Name, shard0.Name, 5) // check that writes succeed utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) @@ -223,7 +223,7 @@ func TestVTOrcRepairs(t *testing.T) { // wait for repair err = utils.WaitForReplicationToStop(t, curPrimary) require.NoError(t, err) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) // check that the writes still succeed utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 10*time.Second) }) @@ -388,7 +388,7 @@ func TestVTOrcWithPrs(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find any replica tablet other than the current primary @@ -416,13 +416,13 @@ func TestVTOrcWithPrs(t *testing.T) { // check that the replica gets promoted utils.CheckPrimaryTablet(t, clusterInfo, replica, true) // Verify that VTOrc didn't run any other recovery - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 0) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, keyspace.Name, shard0.Name, 0) utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 0) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 0) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 0) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 0) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, keyspace.Name, shard0.Name, 0) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, keyspace.Name, shard0.Name, 0) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, keyspace.Name, shard0.Name, 0) utils.VerifyWritesSucceed(t, clusterInfo, replica, shard0.Vttablets, 10*time.Second) } @@ -512,7 +512,7 @@ func TestFullStatusConnectionPooling(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // Kill the current primary. diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go index 180f367d7fb..069371ea3d2 100644 --- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go +++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go @@ -52,7 +52,7 @@ func TestDownPrimary(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find the replica and rdonly tablets @@ -99,7 +99,7 @@ func TestDownPrimary(t *testing.T) { // also check that the replication is working correctly after failover utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) } @@ -155,7 +155,7 @@ func TestDownPrimaryBeforeVTOrc(t *testing.T) { // also check that the replication is working correctly after failover utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) } @@ -170,7 +170,7 @@ func TestDeletedPrimaryTablet(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find the replica and rdonly tablets @@ -221,7 +221,7 @@ func TestDeletedPrimaryTablet(t *testing.T) { // also check that the replication is working correctly after failover utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryTabletDeletedRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryTabletDeletedRecoveryName, keyspace.Name, shard0.Name, 1) } // TestDeadPrimaryRecoversImmediately test Vtorc ability to recover immediately if primary is dead. @@ -242,7 +242,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find the replica and rdonly tablets @@ -279,7 +279,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) { utils.WaitForInstancePollSecondsExceededCount(t, vtOrcProcess, "InstancePollSecondsExceeded", 2, false) // also check that the replication is working correctly after failover utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // Parse log file and find out how much time it took for DeadPrimary to recover. diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index e923e86e825..36ca8a8c422 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -952,14 +952,15 @@ func WaitForReadOnlyValue(t *testing.T, curPrimary *cluster.Vttablet, expectValu } // WaitForSuccessfulRecoveryCount waits until the given recovery name's count of successful runs matches the count expected -func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess, recoveryName string, countExpected int) { +func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess, recoveryName, keyspace, shard string, countExpected int) { t.Helper() timeout := 15 * time.Second startTime := time.Now() for time.Since(startTime) < timeout { vars := vtorcInstance.GetVars() successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{}) - successCount := getIntFromValue(successfulRecoveriesMap[recoveryName]) + successCountKey := recoveryName + "." + keyspace + "." + shard + successCount := getIntFromValue(successfulRecoveriesMap[successCountKey]) if successCount == countExpected { return } From 7cffba7bc0e6f16b63d2eae0c614f05190a30132 Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Tue, 27 May 2025 20:37:40 +0200 Subject: [PATCH 3/3] make var once Signed-off-by: Tim Vaillancourt --- go/test/endtoend/vtorc/utils/utils.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index 36ca8a8c422..6c542aa0b54 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -956,11 +956,11 @@ func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcPr t.Helper() timeout := 15 * time.Second startTime := time.Now() + mapKey := fmt.Sprintf("%s.%s.%s", recoveryName, keyspace, shard) for time.Since(startTime) < timeout { vars := vtorcInstance.GetVars() successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{}) - successCountKey := recoveryName + "." + keyspace + "." + shard - successCount := getIntFromValue(successfulRecoveriesMap[successCountKey]) + successCount := getIntFromValue(successfulRecoveriesMap[mapKey]) if successCount == countExpected { return }