diff --git a/changelog/23.0/23.0.0/summary.md b/changelog/23.0/23.0.0/summary.md index db83795372e..d8cd8f8a2c7 100644 --- a/changelog/23.0/23.0.0/summary.md +++ b/changelog/23.0/23.0.0/summary.md @@ -9,6 +9,8 @@ - [VTGate](#new-vtgate-metrics) - **[Topology](#minor-changes-topo)** - [`--consul_auth_static_file` requires 1 or more credentials](#consul_auth_static_file-check-creds) + - **[VTOrc](#minor-changes-vtorc)** + - [Recovery stats to include keyspace/shard](#recoveries-stats-keyspace-shard) - **[VTTablet](#minor-changes-vttablet)** - [CLI Flags](#flags-vttablet) - [Managed MySQL configuration defaults to caching-sha2-password](#mysql-caching-sha2-password) @@ -40,6 +42,18 @@ The `--consul_auth_static_file` flag used in several components now requires that 1 or more credentials can be loaded from the provided json file. +### VTOrc + +#### Recovery stats to include keyspace/shard + +The following recovery-related stats now include labels for keyspaces and shards: +1. `FailedRecoveries` +2. `PendingRecoveries` +3. `RecoveriesCount` +4. `SuccessfulRecoveries` + +Prior to this release, only the recovery "type" was included in labels. 
+ ### VTTablet #### CLI Flags diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go index 844085a007d..e9e48dfd2fa 100644 --- a/go/test/endtoend/vtorc/general/vtorc_test.go +++ b/go/test/endtoend/vtorc/general/vtorc_test.go @@ -76,7 +76,7 @@ func TestErrantGTIDOnPreviousPrimary(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) var replica, otherReplica *cluster.Vttablet @@ -135,7 +135,7 @@ func TestSingleKeyspace(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true) utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1) } @@ -153,7 +153,7 @@ func TestKeyspaceShard(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true) utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, keyspace.Name, 
shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1) } @@ -176,7 +176,7 @@ func TestVTOrcRepairs(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) var replica, otherReplica *cluster.Vttablet @@ -204,7 +204,7 @@ func TestVTOrcRepairs(t *testing.T) { // wait for repair match := utils.WaitForReadOnlyValue(t, curPrimary, 0) require.True(t, match) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) }) t.Run("ReplicaReadWrite", func(t *testing.T) { @@ -215,7 +215,7 @@ func TestVTOrcRepairs(t *testing.T) { // wait for repair match := utils.WaitForReadOnlyValue(t, replica, 1) require.True(t, match) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, keyspace.Name, shard0.Name, 1) }) t.Run("StopReplication", func(t *testing.T) { @@ -225,7 +225,7 @@ func TestVTOrcRepairs(t *testing.T) { // check replication is setup correctly utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 2) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, keyspace.Name, shard0.Name, 2) // Stop just the IO thread on the replica 
_, err = utils.RunSQL(t, "STOP REPLICA IO_THREAD", replica, "") @@ -233,7 +233,7 @@ func TestVTOrcRepairs(t *testing.T) { // check replication is setup correctly utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 3) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, keyspace.Name, shard0.Name, 3) // Stop just the SQL thread on the replica _, err = utils.RunSQL(t, "STOP REPLICA SQL_THREAD", replica, "") @@ -241,7 +241,7 @@ func TestVTOrcRepairs(t *testing.T) { // check replication is setup correctly utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 4) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, keyspace.Name, shard0.Name, 4) }) t.Run("ReplicationFromOtherReplica", func(t *testing.T) { @@ -257,7 +257,7 @@ func TestVTOrcRepairs(t *testing.T) { // wait until the source port is set back correctly by vtorc utils.CheckSourcePort(t, replica, curPrimary, 15*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 5) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, keyspace.Name, shard0.Name, 5) // check that writes succeed utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) @@ -269,7 +269,7 @@ func TestVTOrcRepairs(t *testing.T) { // wait until heart beat interval has been fixed by vtorc. 
utils.CheckHeartbeatInterval(t, replica, 16.5, 15*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 6) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, keyspace.Name, shard0.Name, 6) // check that writes succeed utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) @@ -292,7 +292,7 @@ func TestVTOrcRepairs(t *testing.T) { // wait for repair err = utils.WaitForReplicationToStop(t, curPrimary) require.NoError(t, err) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) // check that the writes still succeed utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 10*time.Second) }) @@ -515,7 +515,7 @@ func TestVTOrcWithPrs(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find any replica tablet other than the current primary @@ -541,13 +541,13 @@ func TestVTOrcWithPrs(t *testing.T) { // check that the replica gets promoted utils.CheckPrimaryTablet(t, clusterInfo, replica, true) // Verify that VTOrc didn't run any other recovery - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) 
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 0) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, keyspace.Name, shard0.Name, 0) utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 0) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 0) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 0) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 0) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, keyspace.Name, shard0.Name, 0) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, keyspace.Name, shard0.Name, 0) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, keyspace.Name, shard0.Name, 0) utils.VerifyWritesSucceed(t, clusterInfo, replica, shard0.Vttablets, 10*time.Second) } @@ -674,7 +674,7 @@ func TestFullStatusConnectionPooling(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // Kill the current primary. 
diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go index 761ce35e66e..3b2c9c33ab5 100644 --- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go +++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go @@ -53,7 +53,7 @@ func TestDownPrimary(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find the replica and rdonly tablets @@ -100,7 +100,7 @@ func TestDownPrimary(t *testing.T) { // also check that the replication is working correctly after failover utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) t.Run("Check ERS and PRS Vars and Metrics", func(t *testing.T) { utils.CheckVarExists(t, vtOrcProcess, "EmergencyReparentCounts") @@ -165,7 +165,7 @@ func TestDownPrimaryBeforeVTOrc(t *testing.T) { // also check that the replication is working correctly after failover utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 
keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) } @@ -179,7 +179,7 @@ func TestDeletedPrimaryTablet(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find the replica and rdonly tablets @@ -230,7 +230,7 @@ func TestDeletedPrimaryTablet(t *testing.T) { // also check that the replication is working correctly after failover utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryTabletDeletedRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryTabletDeletedRecoveryName, keyspace.Name, shard0.Name, 1) } // TestDeadPrimaryRecoversImmediately test Vtorc ability to recover immediately if primary is dead. 
@@ -250,7 +250,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find the replica and rdonly tablets @@ -287,7 +287,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) { utils.WaitForInstancePollSecondsExceededCount(t, vtOrcProcess, "InstancePollSecondsExceeded", 2, false) // also check that the replication is working correctly after failover utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // Parse log file and find out how much time it took for DeadPrimary to recover. 
diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index caca923b1e2..e19e1e0b191 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -989,14 +989,15 @@ func WaitForReadOnlyValue(t *testing.T, curPrimary *cluster.Vttablet, expectValu } // WaitForSuccessfulRecoveryCount waits until the given recovery name's count of successful runs matches the count expected -func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess, recoveryName string, countExpected int) { +func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess, recoveryName, keyspace, shard string, countExpected int) { t.Helper() timeout := 15 * time.Second startTime := time.Now() + mapKey := fmt.Sprintf("%s.%s.%s", recoveryName, keyspace, shard) for time.Since(startTime) < timeout { vars := vtorcInstance.GetVars() successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{}) - successCount := GetIntFromValue(successfulRecoveriesMap[recoveryName]) + successCount := GetIntFromValue(successfulRecoveriesMap[mapKey]) if successCount == countExpected { return } diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index 25b79e1f1bc..b4cbfa5cc3a 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -51,14 +51,6 @@ const ( ) var ( - actionableRecoveriesNames = []string{ - RecoverDeadPrimaryRecoveryName, - RecoverPrimaryHasPrimaryRecoveryName, - ElectNewPrimaryRecoveryName, - FixPrimaryRecoveryName, - FixReplicaRecoveryName, - } - countPendingRecoveries = stats.NewGauge("PendingRecoveries", "Count of the number of pending recoveries") // detectedProblems is used to track the number of detected problems. @@ -75,14 +67,17 @@ var ( // shardsLockCounter is a count of in-flight shard locks. Use atomics to read/update. 
shardsLockCounter int64 + // recoveriesCounterLabels are labels for grouping the counter based stats for recoveries. + recoveriesCounterLabels = []string{"RecoveryType", "Keyspace", "Shard"} + // recoveriesCounter counts the number of recoveries that VTOrc has performed - recoveriesCounter = stats.NewCountersWithSingleLabel("RecoveriesCount", "Count of the different recoveries performed", "RecoveryType", actionableRecoveriesNames...) + recoveriesCounter = stats.NewCountersWithMultiLabels("RecoveriesCount", "Count of the different recoveries performed", recoveriesCounterLabels) // recoveriesSuccessfulCounter counts the number of successful recoveries that VTOrc has performed - recoveriesSuccessfulCounter = stats.NewCountersWithSingleLabel("SuccessfulRecoveries", "Count of the different successful recoveries performed", "RecoveryType", actionableRecoveriesNames...) + recoveriesSuccessfulCounter = stats.NewCountersWithMultiLabels("SuccessfulRecoveries", "Count of the different successful recoveries performed", recoveriesCounterLabels) // recoveriesFailureCounter counts the number of failed recoveries that VTOrc has performed - recoveriesFailureCounter = stats.NewCountersWithSingleLabel("FailedRecoveries", "Count of the different failed recoveries performed", "RecoveryType", actionableRecoveriesNames...) + recoveriesFailureCounter = stats.NewCountersWithMultiLabels("FailedRecoveries", "Count of the different failed recoveries performed", recoveriesCounterLabels) // shardLockTimings measures the timing of LockShard operations. 
shardLockTimingsActions = []string{"Lock", "Unlock"} @@ -651,13 +646,14 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er return err } recoveryName := getRecoverFunctionName(checkAndRecoverFunctionCode) - recoveriesCounter.Add(recoveryName, 1) + recoveryLabels := []string{recoveryName, analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard} + recoveriesCounter.Add(recoveryLabels, 1) if err != nil { logger.Errorf("Failed to recover: %+v", err) - recoveriesFailureCounter.Add(recoveryName, 1) + recoveriesFailureCounter.Add(recoveryLabels, 1) } else { logger.Info("Recovery succeeded") - recoveriesSuccessfulCounter.Add(recoveryName, 1) + recoveriesSuccessfulCounter.Add(recoveryLabels, 1) } if topologyRecovery == nil { logger.Error("Topology recovery is nil - recovery might have failed")