diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go index 4e6cf8abf16..8fe38a9791b 100644 --- a/go/test/endtoend/vtorc/general/vtorc_test.go +++ b/go/test/endtoend/vtorc/general/vtorc_test.go @@ -75,7 +75,7 @@ func TestErrantGTIDOnPreviousPrimary(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) var replica, otherReplica *cluster.Vttablet @@ -134,7 +134,7 @@ func TestSingleKeyspace(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true) utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1) } @@ -152,7 +152,7 @@ func TestKeyspaceShard(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true) utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1) } @@ -175,7 +175,7 @@ func TestVTOrcRepairs(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) var replica, otherReplica *cluster.Vttablet @@ -203,7 +203,7 @@ func TestVTOrcRepairs(t *testing.T) { // wait for repair match := utils.WaitForReadOnlyValue(t, curPrimary, 0) require.True(t, match) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) }) t.Run("ReplicaReadWrite", func(t *testing.T) { @@ -214,7 +214,7 @@ func TestVTOrcRepairs(t *testing.T) { // wait for repair match := utils.WaitForReadOnlyValue(t, replica, 1) require.True(t, match) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, keyspace.Name, shard0.Name, 1) }) t.Run("StopReplication", func(t *testing.T) { @@ -224,7 +224,7 @@ func TestVTOrcRepairs(t *testing.T) { // check replication is setup correctly utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 2) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, keyspace.Name, shard0.Name, 2) // Stop just the IO thread on the replica _, err = utils.RunSQL(t, "STOP REPLICA IO_THREAD", replica, "") @@ -232,7 +232,7 @@ func TestVTOrcRepairs(t *testing.T) { // check replication is setup correctly utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 3) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, keyspace.Name, shard0.Name, 3) // Stop just the SQL thread on the replica _, err = utils.RunSQL(t, "STOP REPLICA SQL_THREAD", replica, "") @@ -240,7 +240,7 @@ func TestVTOrcRepairs(t *testing.T) { // check replication is setup correctly utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 4) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, keyspace.Name, shard0.Name, 4) }) t.Run("ReplicationFromOtherReplica", func(t *testing.T) { @@ -256,7 +256,7 @@ func TestVTOrcRepairs(t *testing.T) { // wait until the source port is set back correctly by vtorc utils.CheckSourcePort(t, replica, curPrimary, 15*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 5) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, keyspace.Name, shard0.Name, 5) // check that writes succeed utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) @@ -268,7 +268,7 @@ func TestVTOrcRepairs(t *testing.T) { // wait until heart beat interval has been fixed by vtorc. utils.CheckHeartbeatInterval(t, replica, 16.5, 15*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 6) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, keyspace.Name, shard0.Name, 6) // check that writes succeed utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) @@ -291,7 +291,7 @@ func TestVTOrcRepairs(t *testing.T) { // wait for repair err = utils.WaitForReplicationToStop(t, curPrimary) require.NoError(t, err) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) // check that the writes still succeed utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 10*time.Second) }) @@ -514,7 +514,7 @@ func TestVTOrcWithPrs(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find any replica tablet other than the current primary @@ -540,13 +540,13 @@ func TestVTOrcWithPrs(t *testing.T) { // check that the replica gets promoted utils.CheckPrimaryTablet(t, clusterInfo, replica, true) // Verify that VTOrc didn't run any other recovery - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 0) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, keyspace.Name, shard0.Name, 0) utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 0) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 0) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 0) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 0) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, keyspace.Name, shard0.Name, 0) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, keyspace.Name, shard0.Name, 0) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, keyspace.Name, shard0.Name, 0) utils.VerifyWritesSucceed(t, clusterInfo, replica, shard0.Vttablets, 10*time.Second) } @@ -673,7 +673,7 @@ func TestFullStatusConnectionPooling(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // Kill the current primary. diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go index de60420eee3..5c04b133242 100644 --- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go +++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go @@ -52,7 +52,7 @@ func TestDownPrimary(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find the replica and rdonly tablets @@ -99,7 +99,7 @@ func TestDownPrimary(t *testing.T) { // also check that the replication is working correctly after failover utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) t.Run("Check ERS and PRS Vars and Metrics", func(t *testing.T) { utils.CheckVarExists(t, vtOrcProcess, "EmergencyReparentCounts") @@ -164,7 +164,7 @@ func TestDownPrimaryBeforeVTOrc(t *testing.T) { // also check that the replication is working correctly after failover utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) } @@ -178,7 +178,7 @@ func TestDeletedPrimaryTablet(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find the replica and rdonly tablets @@ -229,7 +229,7 @@ func TestDeletedPrimaryTablet(t *testing.T) { // also check that the replication is working correctly after failover utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryTabletDeletedRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryTabletDeletedRecoveryName, keyspace.Name, shard0.Name, 1) } // TestDeadPrimaryRecoversImmediately test Vtorc ability to recover immediately if primary is dead. @@ -249,7 +249,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find the replica and rdonly tablets @@ -286,7 +286,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) { utils.WaitForInstancePollSecondsExceededCount(t, vtOrcProcess, "InstancePollSecondsExceeded", 2, false) // also check that the replication is working correctly after failover utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // Parse log file and find out how much time it took for DeadPrimary to recover. diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index 34ef326535b..4b2cac1f867 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -988,14 +988,15 @@ func WaitForReadOnlyValue(t *testing.T, curPrimary *cluster.Vttablet, expectValu } // WaitForSuccessfulRecoveryCount waits until the given recovery name's count of successful runs matches the count expected -func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess, recoveryName string, countExpected int) { +func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess, recoveryName, keyspace, shard string, countExpected int) { t.Helper() timeout := 15 * time.Second startTime := time.Now() + mapKey := fmt.Sprintf("%s.%s.%s", recoveryName, keyspace, shard) for time.Since(startTime) < timeout { vars := vtorcInstance.GetVars() successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{}) - successCount := GetIntFromValue(successfulRecoveriesMap[recoveryName]) + successCount := GetIntFromValue(successfulRecoveriesMap[mapKey]) if successCount == countExpected { return } diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index 98cba60974e..7d5d633e4c9 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -51,14 +51,6 @@ const ( ) var ( - actionableRecoveriesNames = []string{ - RecoverDeadPrimaryRecoveryName, - RecoverPrimaryHasPrimaryRecoveryName, - ElectNewPrimaryRecoveryName, - FixPrimaryRecoveryName, - FixReplicaRecoveryName, - } - countPendingRecoveries = stats.NewGauge("PendingRecoveries", "Count of the number of pending recoveries") // detectedProblems is used to track the number of detected problems. @@ -75,14 +67,17 @@ var ( // shardsLockCounter is a count of in-flight shard locks. Use atomics to read/update. shardsLockCounter int64 + // recoveriesCounterLabels are labels for grouping the counter based stats for recoveries. + recoveriesCounterLabels = []string{"RecoveryType", "Keyspace", "Shard"} + // recoveriesCounter counts the number of recoveries that VTOrc has performed - recoveriesCounter = stats.NewCountersWithSingleLabel("RecoveriesCount", "Count of the different recoveries performed", "RecoveryType", actionableRecoveriesNames...) + recoveriesCounter = stats.NewCountersWithMultiLabels("RecoveriesCount", "Count of the different recoveries performed", recoveriesCounterLabels) // recoveriesSuccessfulCounter counts the number of successful recoveries that VTOrc has performed - recoveriesSuccessfulCounter = stats.NewCountersWithSingleLabel("SuccessfulRecoveries", "Count of the different successful recoveries performed", "RecoveryType", actionableRecoveriesNames...) + recoveriesSuccessfulCounter = stats.NewCountersWithMultiLabels("SuccessfulRecoveries", "Count of the different successful recoveries performed", recoveriesCounterLabels) // recoveriesFailureCounter counts the number of failed recoveries that VTOrc has performed - recoveriesFailureCounter = stats.NewCountersWithSingleLabel("FailedRecoveries", "Count of the different failed recoveries performed", "RecoveryType", actionableRecoveriesNames...) + recoveriesFailureCounter = stats.NewCountersWithMultiLabels("FailedRecoveries", "Count of the different failed recoveries performed", recoveriesCounterLabels) // shardLockTimings measures the timing of LockShard operations. shardLockTimingsActions = []string{"Lock", "Unlock"} @@ -637,13 +632,14 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er return err } recoveryName := getRecoverFunctionName(checkAndRecoverFunctionCode) - recoveriesCounter.Add(recoveryName, 1) + recoveryLabels := []string{recoveryName, analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard} + recoveriesCounter.Add(recoveryLabels, 1) if err != nil { logger.Errorf("Failed to recover: %+v", err) - recoveriesFailureCounter.Add(recoveryName, 1) + recoveriesFailureCounter.Add(recoveryLabels, 1) } else { logger.Info("Recovery succeeded") - recoveriesSuccessfulCounter.Add(recoveryName, 1) + recoveriesSuccessfulCounter.Add(recoveryLabels, 1) } if topologyRecovery == nil { logger.Error("Topology recovery is nil - recovery might have failed")