diff --git a/go/test/endtoend/backup/vtbackup/main_test.go b/go/test/endtoend/backup/vtbackup/main_test.go index fd63569f380..4a6e10e6c5a 100644 --- a/go/test/endtoend/backup/vtbackup/main_test.go +++ b/go/test/endtoend/backup/vtbackup/main_test.go @@ -43,13 +43,7 @@ var ( dbPassword = "VtDbaPass" shardKsName = fmt.Sprintf("%s/%s", keyspaceName, shardName) dbCredentialFile string - commonTabletArg = []string{ - vtutils.GetFlagVariantForTests("--vreplication-retry-delay"), "1s", - vtutils.GetFlagVariantForTests("--degraded-threshold"), "5s", - vtutils.GetFlagVariantForTests("--lock-tables-timeout"), "5s", - vtutils.GetFlagVariantForTests("--watch-replication-stream"), - vtutils.GetFlagVariantForTests("--enable-replication-reporter"), - vtutils.GetFlagVariantForTests("--serving-state-grace-period"), "1s"} + commonTabletArg []string ) func TestMain(m *testing.M) { @@ -59,6 +53,16 @@ func TestMain(m *testing.M) { localCluster = cluster.NewCluster(cell, hostname) defer localCluster.Teardown() + vttabletVer := localCluster.VtTabletMajorVersion + commonTabletArg = []string{ + vtutils.GetFlagVariantForTestsByVersion("--vreplication-retry-delay", vttabletVer), "1s", + vtutils.GetFlagVariantForTestsByVersion("--degraded-threshold", vttabletVer), "5s", + vtutils.GetFlagVariantForTestsByVersion("--lock-tables-timeout", vttabletVer), "5s", + vtutils.GetFlagVariantForTestsByVersion("--watch-replication-stream", vttabletVer), + vtutils.GetFlagVariantForTestsByVersion("--enable-replication-reporter", vttabletVer), + vtutils.GetFlagVariantForTestsByVersion("--serving-state-grace-period", vttabletVer), "1s", + } + // Start topo server err := localCluster.StartTopo() if err != nil { diff --git a/go/test/endtoend/cluster/cluster_process.go b/go/test/endtoend/cluster/cluster_process.go index beca801497f..dcb552f80b5 100644 --- a/go/test/endtoend/cluster/cluster_process.go +++ b/go/test/endtoend/cluster/cluster_process.go @@ -60,9 +60,10 @@ import ( // DefaultCell : If no cell name is 
passed, then use following const ( - DefaultCell = "zone1" - DefaultStartPort = 6700 - DefaultVttestEnv = "VTTEST=endtoend" + DefaultCell = "zone1" + DefaultStartPort = 6700 + DefaultVttestEnv = "VTTEST=endtoend" + DefaultVtorcsByCell = 1 ) var ( @@ -298,7 +299,6 @@ func (cluster *LocalProcessCluster) StartUnshardedKeyspace(keyspace Keyspace, re } func (cluster *LocalProcessCluster) startPartialKeyspace(keyspace Keyspace, shardNames []string, movedShard string, replicaCount int, rdonly bool, customizers ...any) (err error) { - cluster.HasPartialKeyspaces = true routedKeyspace := &Keyspace{ Name: fmt.Sprintf("%s_routed", keyspace.Name), @@ -806,7 +806,7 @@ func NewBareCluster(cell string, hostname string) *LocalProcessCluster { // path/to/whatever exists cluster.ReusingVTDATAROOT = true } else { - err = createDirectory(cluster.CurrentVTDATAROOT, 0700) + err = createDirectory(cluster.CurrentVTDATAROOT, 0o700) if err != nil { log.Fatal(err) } @@ -1160,7 +1160,8 @@ func (cluster *LocalProcessCluster) waitForMySQLProcessToExit(mysqlctlProcessLis // StartVtbackup starts a vtbackup func (cluster *LocalProcessCluster) StartVtbackup(newInitDBFile string, initialBackup bool, - keyspace string, shard string, cell string, extraArgs ...string) error { + keyspace string, shard string, cell string, extraArgs ...string, +) error { log.Info("Starting vtbackup") cluster.VtbackupProcess = *VtbackupProcessInstance( cluster.GetAndReserveTabletUID(), @@ -1175,7 +1176,6 @@ func (cluster *LocalProcessCluster) StartVtbackup(newInitDBFile string, initialB initialBackup) cluster.VtbackupProcess.ExtraArgs = extraArgs return cluster.VtbackupProcess.Setup() - } // GetAndReservePort gives port for required process @@ -1191,7 +1191,6 @@ func (cluster *LocalProcessCluster) GetAndReservePort() int { cluster.nextPortForProcess = cluster.nextPortForProcess + 1 log.Infof("Attempting to reserve port: %v", cluster.nextPortForProcess) ln, err := net.Listen("tcp", net.JoinHostPort("127.0.0.1", 
strconv.Itoa(cluster.nextPortForProcess))) - if err != nil { log.Errorf("Can't listen on port %v: %s, trying next port", cluster.nextPortForProcess, err) continue @@ -1214,7 +1213,7 @@ const portFileTimeout = 1 * time.Hour // If yes, then return that port, and save port + 200 in the same file // here, assumptions is 200 ports might be consumed for all tests in a package func getPort() int { - portFile, err := os.OpenFile(path.Join(os.TempDir(), "endtoend.port"), os.O_CREATE|os.O_RDWR, 0644) + portFile, err := os.OpenFile(path.Join(os.TempDir(), "endtoend.port"), os.O_CREATE|os.O_RDWR, 0o644) if err != nil { panic(err) } diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go index b68f0aceb0f..4cf8bc0732c 100644 --- a/go/test/endtoend/vtorc/general/vtorc_test.go +++ b/go/test/endtoend/vtorc/general/vtorc_test.go @@ -18,7 +18,9 @@ package general import ( "context" + "encoding/json" "fmt" + "strconv" "testing" "time" @@ -893,3 +895,85 @@ func TestFullStatusConnectionPooling(t *testing.T) { assert.Equal(t, 200, status) assert.Equal(t, "null", resp) } + +// TestSemiSyncRecoveryOrdering verifies that when the durability policy changes +// to semi_sync, VTOrc fixes ReplicaSemiSyncMustBeSet before PrimarySemiSyncMustBeSet. +// This ordering is enforced by the AfterAnalyses/BeforeAnalyses dependencies. +func TestSemiSyncRecoveryOrdering(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) + // Start with durability "none" so no semi-sync is required initially. + utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 0, nil, cluster.VTOrcConfiguration{ + PreventCrossCellFailover: true, + }, cluster.DefaultVtorcsByCell, policy.DurabilityNone) + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + + // Wait for primary election and healthy replication. 
+ primary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) + assert.NotNil(t, primary, "should have elected a primary") + utils.CheckReplication(t, clusterInfo, primary, shard0.Vttablets, 10*time.Second) + + vtorc := clusterInfo.ClusterInstance.VTOrcProcesses[0] + utils.WaitForSuccessfulRecoveryCount(t, vtorc, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1) + + // Change durability to semi_sync. VTOrc should detect that replicas and primary + // need semi-sync enabled, and fix them in the correct order. + out, err := clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput( + "SetKeyspaceDurabilityPolicy", keyspace.Name, "--durability-policy="+policy.DurabilitySemiSync) + require.NoError(t, err, out) + + // Poll the database-state API to verify recovery ordering. + // The topology_recovery table has auto-incremented recovery_id values that + // reflect execution order. All ReplicaSemiSyncMustBeSet recovery_ids should + // be less than any PrimarySemiSyncMustBeSet recovery_id. 
+ type tableState struct { + TableName string + Rows []map[string]any + } + + assert.EventuallyWithT(t, func(c *assert.CollectT) { + status, response, err := utils.MakeAPICall(t, vtorc, "/api/database-state") + assert.NoError(c, err) + assert.Equal(c, 200, status) + + var tables []tableState + if !assert.NoError(c, json.Unmarshal([]byte(response), &tables)) { + return + } + + var maxReplicaRecoveryID, minPrimaryRecoveryID int + var replicaCount, primaryCount int + for _, table := range tables { + if table.TableName != "topology_recovery" { + continue + } + for _, row := range table.Rows { + analysis, _ := row["analysis"].(string) + recoveryIDStr, _ := row["recovery_id"].(string) + recoveryID, err := strconv.Atoi(recoveryIDStr) + if err != nil { + continue + } + switch inst.AnalysisCode(analysis) { + case inst.ReplicaSemiSyncMustBeSet: + replicaCount++ + if replicaCount == 1 || recoveryID > maxReplicaRecoveryID { + maxReplicaRecoveryID = recoveryID + } + case inst.PrimarySemiSyncMustBeSet: + primaryCount++ + if primaryCount == 1 || recoveryID < minPrimaryRecoveryID { + minPrimaryRecoveryID = recoveryID + } + } + } + } + + assert.Greater(c, replicaCount, 0, "should have ReplicaSemiSyncMustBeSet recoveries") + assert.Greater(c, primaryCount, 0, "should have PrimarySemiSyncMustBeSet recoveries") + if replicaCount > 0 && primaryCount > 0 { + assert.Less(c, maxReplicaRecoveryID, minPrimaryRecoveryID, + "all ReplicaSemiSyncMustBeSet recoveries should have lower recovery_id than PrimarySemiSyncMustBeSet") + } + }, 30*time.Second, time.Second) +} diff --git a/go/vt/vtorc/inst/analysis.go b/go/vt/vtorc/inst/analysis.go index 6bc6101581b..76a65bd46a8 100644 --- a/go/vt/vtorc/inst/analysis.go +++ b/go/vt/vtorc/inst/analysis.go @@ -21,6 +21,7 @@ import ( "time" topodatapb "vitess.io/vitess/go/vt/proto/topodata" + "vitess.io/vitess/go/vt/vtctl/reparentutil/policy" "vitess.io/vitess/go/vt/vtorc/config" ) @@ -115,11 +116,13 @@ type DetectionAnalysis struct { CountReplicas uint 
CountValidReplicas uint CountValidReplicatingReplicas uint + CountValidSemiSyncReplicatingReplicas uint ReplicationStopped bool ErrantGTID string ReplicaNetTimeout int32 HeartbeatInterval float64 Analysis AnalysisCode + AnalysisMatchedProblems []*DetectionAnalysisProblemMeta Description string StructureAnalysis []StructureAnalysisCode OracleGTIDImmediateTopology bool @@ -148,6 +151,16 @@ type DetectionAnalysis struct { IsDiskStalled bool } +// hasMinSemiSyncAckers returns true if there are a minimum number of semi-sync ackers enabled and replicating. +// True is always returned if the durability policy does not require semi-sync ackers (eg: "none"). This gives +// a useful signal if it is safe to enable semi-sync without risk of stalling ongoing PRIMARY writes. +func hasMinSemiSyncAckers(durabler policy.Durabler, primary *topodatapb.Tablet, analysis *DetectionAnalysis) bool { + if durabler == nil || analysis == nil { + return false + } + return int(analysis.CountValidSemiSyncReplicatingReplicas) >= durabler.SemiSyncAckers(primary) +} + func (detectionAnalysis *DetectionAnalysis) MarshalJSON() ([]byte, error) { i := struct { DetectionAnalysis diff --git a/go/vt/vtorc/inst/analysis_dao.go b/go/vt/vtorc/inst/analysis_dao.go index 9b457bee8c2..359c9e22acf 100644 --- a/go/vt/vtorc/inst/analysis_dao.go +++ b/go/vt/vtorc/inst/analysis_dao.go @@ -18,7 +18,6 @@ package inst import ( "fmt" - "math" "time" "github.com/patrickmn/go-cache" @@ -196,6 +195,15 @@ func GetDetectionAnalysis(keyspace string, shard string, hints *DetectionAnalysi ), 0 ) AS count_valid_semi_sync_replicas, + IFNULL( + SUM( + replica_instance.last_checked <= replica_instance.last_seen + AND replica_instance.replica_io_running != 0 + AND replica_instance.replica_sql_running != 0 + AND replica_instance.semi_sync_replica_enabled != 0 + ), + 0 + ) AS count_valid_semi_sync_replicating_replicas, IFNULL( SUM( replica_instance.log_bin @@ -350,6 +358,7 @@ func GetDetectionAnalysis(keyspace string, shard string, 
hints *DetectionAnalysi a.SemiSyncBlocked = m.GetBool("semi_sync_blocked") a.SemiSyncReplicaEnabled = m.GetBool("semi_sync_replica_enabled") a.CountSemiSyncReplicasEnabled = m.GetUint("count_semi_sync_replicas") + a.CountValidSemiSyncReplicatingReplicas = m.GetUint("count_valid_semi_sync_replicating_replicas") // countValidSemiSyncReplicasEnabled := m.GetUint("count_valid_semi_sync_replicas") a.SemiSyncPrimaryWaitForReplicaCount = m.GetUint("semi_sync_primary_wait_for_replica_count") a.SemiSyncPrimaryClients = m.GetUint("semi_sync_primary_clients") @@ -405,7 +414,7 @@ func GetDetectionAnalysis(keyspace string, shard string, hints *DetectionAnalysi // Increment the total number of tablets. ca.totalTablets += 1 if ca.hasShardWideAction { - // We can only take one shard level action at a time. + // We can only take one shard-wide action at a time. return nil } if ca.durability == nil { @@ -413,152 +422,30 @@ func GetDetectionAnalysis(keyspace string, shard string, hints *DetectionAnalysi return nil } isInvalid := m.GetBool("is_invalid") - switch { - case a.IsClusterPrimary && isInvalid: - a.Analysis = InvalidPrimary - a.Description = "VTOrc hasn't been able to reach the primary even once since restart/shutdown" - case isInvalid: - a.Analysis = InvalidReplica - a.Description = "VTOrc hasn't been able to reach the replica even once since restart/shutdown" - case a.IsClusterPrimary && !a.LastCheckValid && a.IsDiskStalled: - a.Analysis = PrimaryDiskStalled - a.Description = "Primary has a stalled disk" - ca.hasShardWideAction = true - case a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas == 0: - a.Analysis = DeadPrimaryWithoutReplicas - a.Description = "Primary cannot be reached by vtorc and has no replica" - ca.hasShardWideAction = true - // - case a.IsClusterPrimary && !a.LastCheckValid && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0: - a.Analysis = DeadPrimary - a.Description = "Primary cannot be reached by vtorc and none of 
its replicas is replicating" - ca.hasShardWideAction = true - // - case a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == 0 && a.CountValidReplicatingReplicas == 0: - a.Analysis = DeadPrimaryAndReplicas - a.Description = "Primary cannot be reached by vtorc and none of its replicas is replicating" - ca.hasShardWideAction = true - // - case a.IsClusterPrimary && !a.LastCheckValid && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0: - a.Analysis = DeadPrimaryAndSomeReplicas - a.Description = "Primary cannot be reached by vtorc; some of its replicas are unreachable and none of its reachable replicas is replicating" - ca.hasShardWideAction = true - // - case a.IsClusterPrimary && !a.IsPrimary: - a.Analysis = PrimaryHasPrimary - a.Description = "Primary is replicating from somewhere else" - ca.hasShardWideAction = true - // - case a.IsClusterPrimary && a.IsReadOnly: - a.Analysis = PrimaryIsReadOnly - a.Description = "Primary is read-only" - // - case a.IsClusterPrimary && policy.SemiSyncAckers(ca.durability, tablet) != 0 && !a.SemiSyncPrimaryEnabled: - a.Analysis = PrimarySemiSyncMustBeSet - a.Description = "Primary semi-sync must be set" - // - case a.IsClusterPrimary && policy.SemiSyncAckers(ca.durability, tablet) == 0 && a.SemiSyncPrimaryEnabled: - a.Analysis = PrimarySemiSyncMustNotBeSet - a.Description = "Primary semi-sync must not be set" - // - case a.IsClusterPrimary && a.CurrentTabletType != topodatapb.TabletType_UNKNOWN && a.CurrentTabletType != topodatapb.TabletType_PRIMARY: - a.Analysis = PrimaryCurrentTypeMismatch - a.Description = "Primary tablet's current type is not PRIMARY" - case isStaleTopoPrimary(a, ca): - a.Analysis = StaleTopoPrimary - a.Description = "Primary tablet is stale, older than current primary" - case topo.IsReplicaType(a.TabletType) && a.ErrantGTID != "": - a.Analysis = ErrantGTIDDetected - a.Description = "Tablet has errant GTIDs" - case 
topo.IsReplicaType(a.TabletType) && ca.primaryAlias == "" && a.ShardPrimaryTermTimestamp.IsZero(): - // ClusterHasNoPrimary should only be detected when the shard record doesn't have any primary term start time specified either. - a.Analysis = ClusterHasNoPrimary - a.Description = "Cluster has no primary" - ca.hasShardWideAction = true - case topo.IsReplicaType(a.TabletType) && ca.primaryAlias == "" && !a.ShardPrimaryTermTimestamp.IsZero(): - // If there are no primary tablets, but the shard primary start time isn't empty, then we know - // the primary tablet was deleted. - a.Analysis = PrimaryTabletDeleted - a.Description = "Primary tablet has been deleted" - ca.hasShardWideAction = true - case a.IsPrimary && a.SemiSyncBlocked && a.CountSemiSyncReplicasEnabled >= a.SemiSyncPrimaryWaitForReplicaCount: - // The primary is reporting that semi-sync monitor is blocked on writes. - // There are enough replicas configured to send semi-sync ACKs such that the primary shouldn't be blocked. - // There is some network diruption in progress. We should run an ERS. 
- a.Analysis = PrimarySemiSyncBlocked - a.Description = "Writes seem to be blocked on semi-sync acks on the primary, even though sufficient replicas are configured to send ACKs" - ca.hasShardWideAction = true - case topo.IsReplicaType(a.TabletType) && !a.IsReadOnly: - a.Analysis = ReplicaIsWritable - a.Description = "Replica is writable" - // - case topo.IsReplicaType(a.TabletType) && a.IsPrimary: - a.Analysis = NotConnectedToPrimary - a.Description = "Not connected to the primary" - // - case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && math.Round(a.HeartbeatInterval*2) != float64(a.ReplicaNetTimeout): - a.Analysis = ReplicaMisconfigured - a.Description = "Replica has been misconfigured" - // - case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && ca.primaryAlias != "" && a.AnalyzedInstancePrimaryAlias != ca.primaryAlias: - a.Analysis = ConnectedToWrongPrimary - a.Description = "Connected to wrong primary" - // - case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && a.ReplicationStopped: - a.Analysis = ReplicationStopped - a.Description = "Replication is stopped" - // - case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && policy.IsReplicaSemiSync(ca.durability, primaryTablet, tablet) && !a.SemiSyncReplicaEnabled: - a.Analysis = ReplicaSemiSyncMustBeSet - a.Description = "Replica semi-sync must be set" - // - case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && !policy.IsReplicaSemiSync(ca.durability, primaryTablet, tablet) && a.SemiSyncReplicaEnabled: - a.Analysis = ReplicaSemiSyncMustNotBeSet - a.Description = "Replica semi-sync must not be set" - // - // TODO(sougou): Events below here are either ignored or not possible. 
- case a.IsPrimary && !a.LastCheckValid && a.CountLaggingReplicas == a.CountReplicas && a.CountDelayedReplicas < a.CountReplicas && a.CountValidReplicatingReplicas > 0: - a.Analysis = UnreachablePrimaryWithLaggingReplicas - a.Description = "Primary cannot be reached by vtorc and all of its replicas are lagging" - // - case a.IsPrimary && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == a.CountValidReplicas: - // partial success is here to reduce noise - a.Analysis = UnreachablePrimary - a.Description = "Primary cannot be reached by vtorc but all of its replicas seem to be replicating; possibly a network/host issue" - // - case a.IsPrimary && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 && a.CountValidReplicatingReplicas < a.CountValidReplicas: - // partial success is here to reduce noise - a.Analysis = UnreachablePrimaryWithBrokenReplicas - a.Description = "Primary cannot be reached by vtorc but it has (some, but not all) replicating replicas; possibly a network/host issue" - // - case a.IsPrimary && a.SemiSyncPrimaryEnabled && a.SemiSyncPrimaryStatus && a.SemiSyncPrimaryWaitForReplicaCount > 0 && a.SemiSyncPrimaryClients < a.SemiSyncPrimaryWaitForReplicaCount: - if isStaleBinlogCoordinates { - a.Analysis = LockedSemiSyncPrimary - a.Description = "Semi sync primary is locked since it doesn't get enough replica acknowledgements" - } else { - a.Analysis = LockedSemiSyncPrimaryHypothesis - a.Description = "Semi sync primary seems to be locked, more samplings needed to validate" + var matchedProblems []*DetectionAnalysisProblem + for _, problem := range detectionAnalysisProblems { + // When isInvalid is true, instance data is unreliable (never been reached). + // Only InvalidPrimary/InvalidReplica should match; postProcessAnalyses + // handles upgrading InvalidPrimary to DeadPrimary if needed. 
+ if isInvalid && problem.Meta.Analysis != InvalidPrimary && problem.Meta.Analysis != InvalidReplica { + continue + } + if problem.HasMatch(a, ca, primaryTablet, tablet, isInvalid, isStaleBinlogCoordinates) { + matchedProblems = append(matchedProblems, problem) + } + } + if len(matchedProblems) > 0 { + sortDetectionAnalysisMatchedProblems(matchedProblems) + for _, problem := range matchedProblems { + a.AnalysisMatchedProblems = append(a.AnalysisMatchedProblems, problem.Meta) } - // - case a.IsPrimary && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0: - a.Analysis = PrimarySingleReplicaNotReplicating - a.Description = "Primary is reachable but its single replica is not replicating" - case a.IsPrimary && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == 0: - a.Analysis = PrimarySingleReplicaDead - a.Description = "Primary is reachable but its single replica is dead" - // - case a.IsPrimary && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0: - a.Analysis = AllPrimaryReplicasNotReplicating - a.Description = "Primary is reachable but none of its replicas is replicating" - // - case a.IsPrimary && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0: - a.Analysis = AllPrimaryReplicasNotReplicatingOrDead - a.Description = "Primary is reachable but none of its replicas is replicating" - // - // case a.IsPrimary && a.CountReplicas == 0: - // a.Analysis = PrimaryWithoutReplicas - // a.Description = "Primary has no replicas" - // } + // We return a single problem per tablet. Any remaining problems will be discovered/recovered + // by VTOrc(s) on future polls. Often many problems are resolved by a single recovery of the + // first problem. The first element of matchedProblems is the highest-priority problem. 
+ chosenProblem := matchedProblems[0] + a.Analysis = chosenProblem.Meta.Analysis + a.Description = chosenProblem.Meta.Description + ca.hasShardWideAction = chosenProblem.Meta.Priority == detectionAnalysisPriorityShardWideAction } { diff --git a/go/vt/vtorc/inst/analysis_dao_test.go b/go/vt/vtorc/inst/analysis_dao_test.go index a6d54896ad3..c9c5ccd7e12 100644 --- a/go/vt/vtorc/inst/analysis_dao_test.go +++ b/go/vt/vtorc/inst/analysis_dao_test.go @@ -63,7 +63,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "ClusterHasNoPrimary", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -81,7 +81,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "PrimaryTabletDeleted", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -100,7 +100,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "StalledDiskPrimary", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -124,7 +124,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "PrimarySemiSyncBlocked", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -153,7 +153,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "LockedSemiSync", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: 
&topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -182,7 +182,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "DeadPrimary", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -205,7 +205,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "DeadPrimaryWithoutReplicas", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -226,7 +226,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "DeadPrimaryAndReplicas", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -247,7 +247,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "DeadPrimaryAndSomeReplicas", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -270,7 +270,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "PrimaryHasPrimary", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -292,7 +292,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "PrimaryIsReadOnly", info: 
[]*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -315,7 +315,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "PrimaryCurrentTypeMismatch", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -337,7 +337,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "Unknown tablet type shouldn't run the mismatch recovery analysis", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -362,7 +362,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "PrimarySemiSyncMustNotBeSet", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -385,7 +385,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "PrimarySemiSyncMustBeSet", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -393,13 +393,15 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { MysqlHostname: "localhost", MysqlPort: 6709, }, - DurabilityPolicy: policy.DurabilitySemiSync, - LastCheckValid: 1, - CountReplicas: 4, - CountValidReplicas: 4, - IsPrimary: 1, - SemiSyncPrimaryEnabled: 0, - CurrentTabletType: 
int(topodatapb.TabletType_PRIMARY), + DurabilityPolicy: policy.DurabilitySemiSync, + LastCheckValid: 1, + CountReplicas: 4, + CountValidReplicas: 4, + CountValidReplicatingReplicas: 4, + CountValidSemiSyncReplicatingReplicas: 1, + IsPrimary: 1, + SemiSyncPrimaryEnabled: 0, + CurrentTabletType: int(topodatapb.TabletType_PRIMARY), }}, keyspaceWanted: "ks", shardWanted: "0", @@ -408,7 +410,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "NotConnectedToPrimary", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -427,7 +429,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { CurrentTabletType: int(topodatapb.TabletType_PRIMARY), }, { TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -446,7 +448,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "ReplicaIsWritable", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -465,7 +467,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { CurrentTabletType: int(topodatapb.TabletType_PRIMARY), }, { TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -475,7 +477,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { }, DurabilityPolicy: policy.DurabilityNone, PrimaryTabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 
101}, }, LastCheckValid: 1, ReadOnly: 0, @@ -487,7 +489,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "ConnectedToWrongPrimary", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -506,7 +508,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { CurrentTabletType: int(topodatapb.TabletType_PRIMARY), }, { TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -516,7 +518,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { }, DurabilityPolicy: policy.DurabilityNone, PrimaryTabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 102}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 102}, }, LastCheckValid: 1, ReadOnly: 1, @@ -528,7 +530,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "ReplicationStopped", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -547,7 +549,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { CurrentTabletType: int(topodatapb.TabletType_PRIMARY), }, { TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -557,7 +559,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { }, DurabilityPolicy: policy.DurabilityNone, PrimaryTabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, }, LastCheckValid: 
1, ReadOnly: 1, @@ -570,7 +572,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "No recoveries on drained tablets", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -589,7 +591,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { CurrentTabletType: int(topodatapb.TabletType_PRIMARY), }, { TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -599,7 +601,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { }, DurabilityPolicy: policy.DurabilityNone, PrimaryTabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, }, LastCheckValid: 1, ReadOnly: 1, @@ -612,7 +614,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "ReplicaMisconfigured", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -631,7 +633,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { CurrentTabletType: int(topodatapb.TabletType_PRIMARY), }, { TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -641,7 +643,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { }, DurabilityPolicy: policy.DurabilityNone, PrimaryTabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, }, LastCheckValid: 1, ReadOnly: 
1, @@ -656,7 +658,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "ReplicaSemiSyncMustBeSet", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -676,7 +678,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { CurrentTabletType: int(topodatapb.TabletType_PRIMARY), }, { TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -685,7 +687,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { MysqlPort: 6709, }, PrimaryTabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, }, DurabilityPolicy: policy.DurabilitySemiSync, LastCheckValid: 1, @@ -699,7 +701,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "ReplicaSemiSyncMustNotBeSet", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -718,7 +720,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { CurrentTabletType: int(topodatapb.TabletType_PRIMARY), }, { TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -727,7 +729,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { MysqlPort: 6709, }, PrimaryTabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, }, DurabilityPolicy: policy.DurabilityNone, LastCheckValid: 1, 
@@ -741,7 +743,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "SnapshotKeyspace", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -761,7 +763,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "EmptyDurabilityPolicy", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -781,7 +783,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "Empty database_instance table", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -801,7 +803,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { CurrentTabletType: int(topodatapb.TabletType_PRIMARY), }, { TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -819,7 +821,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "DeadPrimary when VTOrc is starting up", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -831,7 +833,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { IsInvalid: 1, }, { TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", 
Keyspace: "ks", Shard: "0", @@ -843,7 +845,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { ReplicationStopped: 1, }, { TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 103}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 103}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -861,7 +863,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "Invalid Primary", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -879,7 +881,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "ErrantGTID", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -898,7 +900,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { CurrentTabletType: int(topodatapb.TabletType_PRIMARY), }, { TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -909,7 +911,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { DurabilityPolicy: policy.DurabilityNone, ErrantGTID: "some errant GTID", PrimaryTabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, }, LastCheckValid: 1, ReadOnly: 1, @@ -921,7 +923,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { name: "ErrantGTID on a non-replica", info: []*test.InfoForRecoveryAnalysis{{ TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, Hostname: "localhost", 
Keyspace: "ks", Shard: "0", @@ -940,7 +942,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { CurrentTabletType: int(topodatapb.TabletType_PRIMARY), }, { TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -951,7 +953,7 @@ func TestGetDetectionAnalysisDecision(t *testing.T) { DurabilityPolicy: policy.DurabilityNone, ErrantGTID: "some errant GTID", PrimaryTabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, }, LastCheckValid: 1, ReadOnly: 1, @@ -1010,7 +1012,7 @@ func TestStalePrimary(t *testing.T) { info := []*test.InfoForRecoveryAnalysis{ { TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -1034,7 +1036,7 @@ func TestStalePrimary(t *testing.T) { }, { TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100}, Hostname: "localhost", Keyspace: "ks", Shard: "0", @@ -1044,7 +1046,7 @@ func TestStalePrimary(t *testing.T) { }, DurabilityPolicy: policy.DurabilitySemiSync, PrimaryTabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101}, }, LastCheckValid: 1, ReadOnly: 1, @@ -1053,7 +1055,7 @@ func TestStalePrimary(t *testing.T) { }, { TabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 102}, + Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 102}, Hostname: "localhost", Keyspace: "ks", Shard: "0", diff --git a/go/vt/vtorc/inst/analysis_problem.go b/go/vt/vtorc/inst/analysis_problem.go new file mode 100644 index 00000000000..fa22e5354dc --- /dev/null +++ 
b/go/vt/vtorc/inst/analysis_problem.go
@@ -0,0 +1,523 @@
+/*
+Copyright 2026 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package inst
+
+import (
+	"math"
+	"slices"
+
+	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
+	"vitess.io/vitess/go/vt/topo"
+	"vitess.io/vitess/go/vt/topo/topoproto"
+	"vitess.io/vitess/go/vt/vtctl/reparentutil/policy"
+)
+
+// Priority levels for detection analysis problems. A lower value sorts
+// earlier; the zero value is the shard-wide action level.
+const (
+	detectionAnalysisPriorityShardWideAction = iota
+	detectionAnalysisPriorityCritical
+	detectionAnalysisPriorityHigh
+	detectionAnalysisPriorityMedium
+	detectionAnalysisPriorityLow
+)
+
+// DetectionAnalysisProblemMeta contains basic metadata describing a problem.
+type DetectionAnalysisProblemMeta struct {
+	// Analysis is the AnalysisCode representing the type of problem.
+	Analysis AnalysisCode
+
+	// Description is a human-readable description of the problem.
+	Description string
+
+	// Priority is an integer influencing the priority sorting of problems. A lower
+	// number is considered to be higher in the sort, with 0 being the top-priority.
+	Priority int
+}
+
+// DetectionAnalysisProblem describes how to match, sort and track a problem.
+type DetectionAnalysisProblem struct {
+	// Meta contains the metadata describing a problem.
+	Meta *DetectionAnalysisProblemMeta
+
+	// AfterAnalyses defines problems that must be recovered before this problem.
+	AfterAnalyses []AnalysisCode
+
+	// BeforeAnalyses defines problems that must be recovered after this problem.
+	BeforeAnalyses []AnalysisCode
+
+	// MatchFunc is a function that returns true when the provided conditions match this problem.
+	MatchFunc func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool
+}
+
+// RequiresOrderedExecution returns true if the problem must be executed
+// sequentially relative to other problems in the same shard: either it has
+// shard-wide action priority, or it declares a before/after dependency.
+//
+// The priority is read through GetPriority so a problem without Meta does not
+// panic; such a problem reports priority 0, which equals the shard-wide action
+// level, and is therefore treated as requiring ordered execution.
+func (dap *DetectionAnalysisProblem) RequiresOrderedExecution() bool {
+	return dap.GetPriority() == detectionAnalysisPriorityShardWideAction ||
+		len(dap.BeforeAnalyses) > 0 ||
+		len(dap.AfterAnalyses) > 0
+}
+
+// GetPriority returns the priority of a problem as an int. A problem without
+// Meta reports 0, the same value as detectionAnalysisPriorityShardWideAction.
+func (dap *DetectionAnalysisProblem) GetPriority() int {
+	if dap.Meta == nil {
+		return 0
+	}
+	return dap.Meta.Priority
+}
+
+// GetDetectionAnalysisProblem returns the DetectionAnalysisProblem registered in
+// detectionAnalysisProblems for the given AnalysisCode, or nil when no entry
+// carries that code. Entries that are nil or lack Meta are skipped.
+func GetDetectionAnalysisProblem(code AnalysisCode) *DetectionAnalysisProblem {
+	for _, p := range detectionAnalysisProblems {
+		if p != nil && p.Meta != nil && p.Meta.Analysis == code {
+			return p
+		}
+	}
+	return nil
+}
+
+// HasMatch returns true if a DetectionAnalysisProblem matches the provided states.
+// It returns false without calling MatchFunc when the analysis, the cluster
+// analysis or MatchFunc itself is nil.
+func (dap *DetectionAnalysisProblem) HasMatch(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+	if a == nil || ca == nil || dap.MatchFunc == nil {
+		return false
+	}
+	return dap.MatchFunc(a, ca, primary, tablet, isInvalid, isStaleBinlogCoordinates)
+}
+
+// detectionAnalysisProblems contains all possible problems to match during detection analysis.
+var detectionAnalysisProblems = []*DetectionAnalysisProblem{
+	// Each entry pairs problem metadata (code, description, priority) with a
+	// MatchFunc predicate over the analysed tablet's state.
+	// NOTE(review): several predicates overlap (e.g. InvalidReplica matches any
+	// invalid tablet, InvalidPrimary a subset), so entry order looks significant
+	// — confirm against the code that iterates this slice before reordering.
+
+	// InvalidPrimary and InvalidReplica
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    InvalidPrimary,
+			Description: "VTOrc hasn't been able to reach the primary even once since restart/shutdown",
+			Priority:    detectionAnalysisPriorityCritical,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return a.IsClusterPrimary && isInvalid
+		},
+	},
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    InvalidReplica,
+			Description: "VTOrc hasn't been able to reach the replica even once since restart/shutdown",
+			Priority:    detectionAnalysisPriorityLow,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return isInvalid
+		},
+	},
+
+	// PrimaryDiskStalled
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    PrimaryDiskStalled,
+			Description: "Primary has a stalled disk",
+			Priority:    detectionAnalysisPriorityShardWideAction,
+		},
+		// Sorted ahead of every DeadPrimary* variant by the dependency rules.
+		BeforeAnalyses: []AnalysisCode{DeadPrimary, DeadPrimaryAndReplicas, DeadPrimaryAndSomeReplicas, DeadPrimaryWithoutReplicas},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return a.IsClusterPrimary && !a.LastCheckValid && a.IsDiskStalled
+		},
+	},
+
+	// DeadPrimary*
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    DeadPrimaryWithoutReplicas,
+			Description: "Primary cannot be reached by vtorc and has no replica",
+			Priority:    detectionAnalysisPriorityShardWideAction,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas == 0
+		},
+	},
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    DeadPrimary,
+			Description: "Primary cannot be reached by vtorc and none of its replicas is replicating",
+			Priority:    detectionAnalysisPriorityShardWideAction,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0
+		},
+	},
+	{
+		// NOTE(review): Description is identical to DeadPrimary's although this
+		// predicate requires zero valid replicas — confirm whether that is intended.
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    DeadPrimaryAndReplicas,
+			Description: "Primary cannot be reached by vtorc and none of its replicas is replicating",
+			Priority:    detectionAnalysisPriorityShardWideAction,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == 0 && a.CountValidReplicatingReplicas == 0
+		},
+	},
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    DeadPrimaryAndSomeReplicas,
+			Description: "Primary cannot be reached by vtorc; some of its replicas are unreachable and none of its reachable replicas is replicating",
+			Priority:    detectionAnalysisPriorityShardWideAction,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return a.IsClusterPrimary && !a.LastCheckValid && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0
+		},
+	},
+
+	// PrimaryHasPrimary
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    PrimaryHasPrimary,
+			Description: "Primary is replicating from somewhere else",
+			Priority:    detectionAnalysisPriorityShardWideAction,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return a.IsClusterPrimary && !a.IsPrimary
+		},
+	},
+
+	// MySQL read-only checks
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    PrimaryIsReadOnly,
+			Description: "Primary is read-only",
+			Priority:    detectionAnalysisPriorityHigh,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return a.IsClusterPrimary && a.IsReadOnly
+		},
+	},
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    ReplicaIsWritable,
+			Description: "Replica is writable",
+			Priority:    detectionAnalysisPriorityMedium,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return topo.IsReplicaType(a.TabletType) && !a.IsReadOnly
+		},
+	},
+
+	// Semi-sync checks
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    PrimarySemiSyncMustBeSet,
+			Description: "Primary semi-sync must be set",
+			Priority:    detectionAnalysisPriorityMedium,
+		},
+		// Mirrors the BeforeAnalyses declared on ReplicaSemiSyncMustBeSet below.
+		AfterAnalyses: []AnalysisCode{ReplicaSemiSyncMustBeSet},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			if !hasMinSemiSyncAckers(ca.durability, primary, a) {
+				return false
+			}
+			return a.IsClusterPrimary && policy.SemiSyncAckers(ca.durability, tablet) != 0 && !a.SemiSyncPrimaryEnabled
+		},
+	},
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    PrimarySemiSyncMustNotBeSet,
+			Description: "Primary semi-sync must not be set",
+			Priority:    detectionAnalysisPriorityMedium,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return a.IsClusterPrimary && policy.SemiSyncAckers(ca.durability, tablet) == 0 && a.SemiSyncPrimaryEnabled
+		},
+	},
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    ReplicaSemiSyncMustBeSet,
+			Description: "Replica semi-sync must be set",
+			Priority:    detectionAnalysisPriorityMedium,
+		},
+		BeforeAnalyses: []AnalysisCode{PrimarySemiSyncMustBeSet},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return topo.IsReplicaType(a.TabletType) && !a.IsPrimary && policy.IsReplicaSemiSync(ca.durability, primary, tablet) && !a.SemiSyncReplicaEnabled
+		},
+	},
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    ReplicaSemiSyncMustNotBeSet,
+			Description: "Replica semi-sync must not be set",
+			Priority:    detectionAnalysisPriorityMedium,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return topo.IsReplicaType(a.TabletType) && !a.IsPrimary && !policy.IsReplicaSemiSync(ca.durability, primary, tablet) && a.SemiSyncReplicaEnabled
+		},
+	},
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    PrimarySemiSyncBlocked,
+			Description: "Writes seem to be blocked on semi-sync acks on the primary, even though sufficient replicas are configured to send ACKs",
+			Priority:    detectionAnalysisPriorityShardWideAction,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return a.IsClusterPrimary && a.IsPrimary && a.SemiSyncBlocked && a.CountSemiSyncReplicasEnabled >= a.SemiSyncPrimaryWaitForReplicaCount
+		},
+	},
+
+	// Primary tablet type checks
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    PrimaryCurrentTypeMismatch,
+			Description: "Primary tablet's current type is not PRIMARY",
+			Priority:    detectionAnalysisPriorityMedium,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return a.IsClusterPrimary && a.CurrentTabletType != topodatapb.TabletType_UNKNOWN && a.CurrentTabletType != topodatapb.TabletType_PRIMARY
+		},
+	},
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    StaleTopoPrimary,
+			Description: "Primary tablet is stale, older than current primary",
+			Priority:    detectionAnalysisPriorityHigh,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return isStaleTopoPrimary(a, ca)
+		},
+	},
+
+	// Errant GTID
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    ErrantGTIDDetected,
+			Description: "Tablet has errant GTIDs",
+			Priority:    detectionAnalysisPriorityMedium,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return topo.IsReplicaType(a.TabletType) && a.ErrantGTID != ""
+		},
+	},
+
+	// Cluster primary checks
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    ClusterHasNoPrimary,
+			Description: "Cluster has no primary",
+			Priority:    detectionAnalysisPriorityShardWideAction,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return topo.IsReplicaType(a.TabletType) && ca.primaryAlias == "" && a.ShardPrimaryTermTimestamp.IsZero()
+		},
+	},
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    PrimaryTabletDeleted,
+			Description: "Primary tablet has been deleted",
+			Priority:    detectionAnalysisPriorityShardWideAction,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return topo.IsReplicaType(a.TabletType) && ca.primaryAlias == "" && !a.ShardPrimaryTermTimestamp.IsZero()
+		},
+	},
+
+	// Replica connectivity checks
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    NotConnectedToPrimary,
+			Description: "Not connected to the primary",
+			Priority:    detectionAnalysisPriorityMedium,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return topo.IsReplicaType(a.TabletType) && a.IsPrimary
+		},
+	},
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    ReplicaMisconfigured,
+			Description: "Replica has been misconfigured",
+			Priority:    detectionAnalysisPriorityMedium,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			// Flags a replica whose net timeout differs from twice the heartbeat interval (rounded).
+			return topo.IsReplicaType(a.TabletType) && !a.IsPrimary && math.Round(a.HeartbeatInterval*2) != float64(a.ReplicaNetTimeout)
+		},
+	},
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    ConnectedToWrongPrimary,
+			Description: "Connected to wrong primary",
+			Priority:    detectionAnalysisPriorityMedium,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return topo.IsReplicaType(a.TabletType) && !a.IsPrimary && ca.primaryAlias != "" && a.AnalyzedInstancePrimaryAlias != ca.primaryAlias
+		},
+	},
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    ReplicationStopped,
+			Description: "Replication is stopped",
+			Priority:    detectionAnalysisPriorityMedium,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return topo.IsReplicaType(a.TabletType) && !a.IsPrimary && a.ReplicationStopped
+		},
+	},
+
+	// Unreachable primary checks
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    UnreachablePrimaryWithLaggingReplicas,
+			Description: "Primary cannot be reached by vtorc and all of its replicas are lagging",
+			Priority:    detectionAnalysisPriorityMedium,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return a.IsPrimary && !a.LastCheckValid && a.CountLaggingReplicas == a.CountReplicas && a.CountDelayedReplicas < a.CountReplicas && a.CountValidReplicatingReplicas > 0
+		},
+	},
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    UnreachablePrimary,
+			Description: "Primary cannot be reached by vtorc but all of its replicas seem to be replicating; possibly a network/host issue",
+			Priority:    detectionAnalysisPriorityMedium,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return a.IsPrimary && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == a.CountValidReplicas
+		},
+	},
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    UnreachablePrimaryWithBrokenReplicas,
+			Description: "Primary cannot be reached by vtorc but it has (some, but not all) replicating replicas; possibly a network/host issue",
+			Priority:    detectionAnalysisPriorityMedium,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return a.IsPrimary && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 && a.CountValidReplicatingReplicas < a.CountValidReplicas
+		},
+	},
+
+	// Locked semi-sync primary
+	// The two entries below share one predicate and differ only in isStaleBinlogCoordinates.
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    LockedSemiSyncPrimary,
+			Description: "Semi sync primary is locked since it doesn't get enough replica acknowledgements",
+			Priority:    detectionAnalysisPriorityMedium,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return a.IsPrimary && a.SemiSyncPrimaryEnabled && a.SemiSyncPrimaryStatus && a.SemiSyncPrimaryWaitForReplicaCount > 0 && a.SemiSyncPrimaryClients < a.SemiSyncPrimaryWaitForReplicaCount && isStaleBinlogCoordinates
+		},
+	},
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    LockedSemiSyncPrimaryHypothesis,
+			Description: "Semi sync primary seems to be locked, more samplings needed to validate",
+			Priority:    detectionAnalysisPriorityMedium,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return a.IsPrimary && a.SemiSyncPrimaryEnabled && a.SemiSyncPrimaryStatus && a.SemiSyncPrimaryWaitForReplicaCount > 0 && a.SemiSyncPrimaryClients < a.SemiSyncPrimaryWaitForReplicaCount && !isStaleBinlogCoordinates
+		},
+	},
+
+	// Primary replica health checks
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    PrimarySingleReplicaNotReplicating,
+			Description: "Primary is reachable but its single replica is not replicating",
+			Priority:    detectionAnalysisPriorityMedium,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return a.IsPrimary && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0
+		},
+	},
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    PrimarySingleReplicaDead,
+			Description: "Primary is reachable but its single replica is dead",
+			Priority:    detectionAnalysisPriorityMedium,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return a.IsPrimary && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == 0
+		},
+	},
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    AllPrimaryReplicasNotReplicating,
+			Description: "Primary is reachable but none of its replicas is replicating",
+			Priority:    detectionAnalysisPriorityLow,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return a.IsPrimary && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas == a.CountReplicas &&
a.CountValidReplicatingReplicas == 0
+		},
+	},
+	{
+		Meta: &DetectionAnalysisProblemMeta{
+			Analysis:    AllPrimaryReplicasNotReplicatingOrDead,
+			Description: "Primary is reachable but none of its replicas is replicating",
+			Priority:    detectionAnalysisPriorityLow,
+		},
+		MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
+			return a.IsPrimary && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0
+		},
+	},
+}
+
+// sortDetectionAnalysisMatchedProblems sorts allProblems in place using
+// compareDetectionAnalysisProblems.
+func sortDetectionAnalysisMatchedProblems(allProblems []*DetectionAnalysisProblem) {
+	// use slices.SortStableFunc because it keeps the original order of equal elements.
+	slices.SortStableFunc(allProblems, compareDetectionAnalysisProblems)
+}
+
+// compareDetectionAnalysisProblems orders two problems for sorting. It returns
+// a negative number when a sorts before b, a positive number when a sorts
+// after b, and 0 when they are considered equal.
+//
+// Explicit Before/After dependencies are checked first and override priority;
+// otherwise the problem with the lower priority value sorts first. A nil
+// problem, or one without Meta, compares equal to everything so it keeps its
+// position under a stable sort instead of panicking.
+func compareDetectionAnalysisProblems(a, b *DetectionAnalysisProblem) int {
+	if a == nil || b == nil || a.Meta == nil || b.Meta == nil {
+		return 0
+	}
+
+	// handle before/after dependencies
+	aAnalysis := a.Meta.Analysis
+	bAnalysis := b.Meta.Analysis
+	if slices.Contains(b.BeforeAnalyses, aAnalysis) || slices.Contains(a.AfterAnalyses, bAnalysis) {
+		return 1
+	}
+	if slices.Contains(a.BeforeAnalyses, bAnalysis) || slices.Contains(b.AfterAnalyses, aAnalysis) {
+		return -1
+	}
+
+	// effective priority (lower is better).
+	aPriority := a.GetPriority()
+	bPriority := b.GetPriority()
+	switch {
+	case aPriority > bPriority:
+		return 1
+	case aPriority < bPriority:
+		return -1
+	}
+
+	return 0
+}
+
+// sortDetectionAnalyses sorts a slice of DetectionAnalysis in place by looking
+// up each entry's Analysis code in detectionAnalysisProblems and comparing the
+// resulting problems with compareDetectionAnalysisProblems. Nil entries and
+// entries whose code has no registered problem compare equal to everything,
+// so the stable sort leaves them where they are.
+func sortDetectionAnalyses(analyses []*DetectionAnalysis) {
+	slices.SortStableFunc(analyses, func(a, b *DetectionAnalysis) int {
+		if a == nil || b == nil {
+			return 0
+		}
+		aProblem := GetDetectionAnalysisProblem(a.Analysis)
+		bProblem := GetDetectionAnalysisProblem(b.Analysis)
+		if aProblem == nil || bProblem == nil {
+			return 0
+		}
+		return compareDetectionAnalysisProblems(aProblem, bProblem)
+	})
+}
+
+// GroupDetectionAnalysesByShard groups a slice of DetectionAnalysis by shard key
+// (topoproto.KeyspaceShardString) and sorts each group by priority. Nil entries
+// are skipped. The returned map is never nil.
+func GroupDetectionAnalysesByShard(analyses []*DetectionAnalysis) map[string][]*DetectionAnalysis {
+	result := make(map[string][]*DetectionAnalysis)
+	for _, a := range analyses {
+		if a == nil {
+			continue
+		}
+		key := topoproto.KeyspaceShardString(a.AnalyzedKeyspace, a.AnalyzedShard)
+		result[key] = append(result[key], a)
+	}
+	// Sorting happens in place on each group's backing slice.
+	for _, group := range result {
+		sortDetectionAnalyses(group)
+	}
+	return result
+}
diff --git a/go/vt/vtorc/inst/analysis_problem_test.go b/go/vt/vtorc/inst/analysis_problem_test.go
new file mode 100644
index 00000000000..536d763bb8c
--- /dev/null
+++ b/go/vt/vtorc/inst/analysis_problem_test.go
@@ -0,0 +1,257 @@
+/*
+Copyright 2026 The Vitess Authors.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package inst
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
+)
+
+// TestSortDetectionAnalysisMatchedProblems checks that sorting places problems
+// by priority and honours the Before/After dependency between
+// ReplicaSemiSyncMustBeSet and PrimarySemiSyncMustBeSet.
+func TestSortDetectionAnalysisMatchedProblems(t *testing.T) {
+	// worstPriority is larger than every named priority constant.
+	worstPriority := 10
+	testCases := []struct {
+		name               string
+		in                 []*DetectionAnalysisProblem
+		postSortByAnalysis []AnalysisCode
+	}{
+		{
+			name: "default",
+			in: []*DetectionAnalysisProblem{
+				{
+					Meta: &DetectionAnalysisProblemMeta{
+						Analysis:    InvalidReplica,
+						Description: "should be 2nd-last, not a shardWideAction, low priority",
+						Priority:    detectionAnalysisPriorityLow,
+					},
+				},
+				{
+					Meta: &DetectionAnalysisProblemMeta{
+						Analysis:    InvalidReplica,
+						Description: "should be last, not a shardWideAction, worst priority",
+						Priority:    worstPriority,
+					},
+				},
+				{
+					Meta: &DetectionAnalysisProblemMeta{
+						Analysis:    PrimaryIsReadOnly,
+						Description: "should be after DeadPrimary, high priority",
+						Priority:    detectionAnalysisPriorityHigh,
+					},
+				},
+				{
+					Meta: &DetectionAnalysisProblemMeta{
+						Analysis:    PrimarySemiSyncMustBeSet,
+						Description: "should be after ReplicaSemiSyncMustBeSet, has an after dependency",
+						Priority:    detectionAnalysisPriorityMedium,
+					},
+					AfterAnalyses: []AnalysisCode{ReplicaSemiSyncMustBeSet},
+				},
+				{
+					Meta: &DetectionAnalysisProblemMeta{
+						Analysis:    ReplicaSemiSyncMustBeSet,
+						Description: "should be before PrimarySemiSyncMustBeSet, has a before dependency",
+						Priority:    detectionAnalysisPriorityMedium,
+					},
+					BeforeAnalyses: []AnalysisCode{PrimarySemiSyncMustBeSet},
+				},
+				{
+					Meta: &DetectionAnalysisProblemMeta{
+						Analysis:    DeadPrimary,
+						Description: "should be 1st, shard-wide action priority",
+						Priority:    detectionAnalysisPriorityShardWideAction,
+					},
+				},
+			},
+			postSortByAnalysis: []AnalysisCode{
+				DeadPrimary,
+				PrimaryIsReadOnly,
+				ReplicaSemiSyncMustBeSet,
+				PrimarySemiSyncMustBeSet,
+				InvalidReplica,
+				InvalidReplica,
+			},
+		},
+	}
+	for _, testCase := range testCases {
+		t.Run(testCase.name, func(t *testing.T) {
+			// sorted aliases testCase.in; the sort happens in place.
+			sorted := testCase.in
+			sortDetectionAnalysisMatchedProblems(sorted)
+
+			require.Len(t, sorted, len(testCase.postSortByAnalysis))
+			for i, analysis := range testCase.postSortByAnalysis {
+				assert.Equal(t, analysis, sorted[i].Meta.Analysis)
+			}
+
+			// confirm last problem has the worstPriority
+			require.Equal(t, worstPriority, sorted[len(sorted)-1].Meta.Priority)
+		})
+	}
+}
+
+// TestRequiresOrderedExecution checks that only shard-wide action priority or a
+// declared Before/After dependency makes a problem require ordered execution.
+func TestRequiresOrderedExecution(t *testing.T) {
+	tests := []struct {
+		name     string
+		problem  *DetectionAnalysisProblem
+		expected bool
+	}{
+		{
+			name: "shard-wide action priority",
+			problem: &DetectionAnalysisProblem{
+				Meta: &DetectionAnalysisProblemMeta{Priority: detectionAnalysisPriorityShardWideAction},
+			},
+			expected: true,
+		},
+		{
+			name: "critical priority",
+			problem: &DetectionAnalysisProblem{
+				Meta: &DetectionAnalysisProblemMeta{Priority: detectionAnalysisPriorityCritical},
+			},
+			expected: false,
+		},
+		{
+			// Meta is left at its zero value here, so Priority is also 0 (shard-wide action).
+			name: "has BeforeAnalyses",
+			problem: &DetectionAnalysisProblem{
+				Meta:           &DetectionAnalysisProblemMeta{},
+				BeforeAnalyses: []AnalysisCode{DeadPrimary},
+			},
+			expected: true,
+		},
+		{
+			name: "has AfterAnalyses",
+			problem: &DetectionAnalysisProblem{
+				Meta:          &DetectionAnalysisProblemMeta{},
+				AfterAnalyses: []AnalysisCode{DeadPrimary},
+			},
+			expected: true,
+		},
+		{
+			name: "independent problem",
+			problem: &DetectionAnalysisProblem{
+				Meta: &DetectionAnalysisProblemMeta{Priority: detectionAnalysisPriorityLow},
+			},
+			expected: false,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			assert.Equal(t, tt.expected, tt.problem.RequiresOrderedExecution())
+		})
+	}
+}
+
+// TestGetDetectionAnalysisProblem checks lookup of a registered code and that an
+// unregistered code yields nil.
+func TestGetDetectionAnalysisProblem(t *testing.T) {
+	problem := GetDetectionAnalysisProblem(DeadPrimary)
+	require.NotNil(t, problem)
+	assert.Equal(t, DeadPrimary, problem.Meta.Analysis)
+
+	problem = GetDetectionAnalysisProblem("NonExistentCode")
+	assert.Nil(t, problem)
+}
+
+// TestCompareDetectionAnalysisProblems checks the comparator's result for
+// priority differences, equal priorities and a Before dependency.
+func TestCompareDetectionAnalysisProblems(t *testing.T) {
+	tests := []struct {
+		name     string
+		a, b     *DetectionAnalysisProblem
+		expected int
+	}{
+		{
+			name: "shard-wide action beats non-shard-wide",
+			a: &DetectionAnalysisProblem{
+				Meta: &DetectionAnalysisProblemMeta{Priority: detectionAnalysisPriorityShardWideAction},
+			},
+			b: &DetectionAnalysisProblem{
+				Meta: &DetectionAnalysisProblemMeta{Priority: detectionAnalysisPriorityHigh},
+			},
+			expected: -1,
+		},
+		{
+			name: "higher priority wins",
+			a: &DetectionAnalysisProblem{
+				Meta: &DetectionAnalysisProblemMeta{Priority: detectionAnalysisPriorityHigh},
+			},
+			b: &DetectionAnalysisProblem{
+				Meta: &DetectionAnalysisProblemMeta{Priority: detectionAnalysisPriorityLow},
+			},
+			expected: -1,
+		},
+		{
+			name: "equal priority",
+			a: &DetectionAnalysisProblem{
+				Meta: &DetectionAnalysisProblemMeta{Priority: detectionAnalysisPriorityMedium},
+			},
+			b: &DetectionAnalysisProblem{
+				Meta: &DetectionAnalysisProblemMeta{Priority: detectionAnalysisPriorityMedium},
+			},
+			expected: 0,
+		},
+		{
+			name: "before dependency",
+			a: &DetectionAnalysisProblem{
+				Meta:           &DetectionAnalysisProblemMeta{Analysis: ReplicaSemiSyncMustBeSet},
+				BeforeAnalyses: []AnalysisCode{PrimarySemiSyncMustBeSet},
+			},
+			b: &DetectionAnalysisProblem{
+				Meta: &DetectionAnalysisProblemMeta{Analysis: PrimarySemiSyncMustBeSet},
+			},
+			expected: -1,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			assert.Equal(t, tt.expected, compareDetectionAnalysisProblems(tt.a, tt.b))
+		})
+	}
+}
+
+// TestGroupDetectionAnalysesByShard checks that analyses are bucketed per
+// keyspace/shard and that each bucket comes back sorted by priority.
+func TestGroupDetectionAnalysesByShard(t *testing.T) {
+	analyses := []*DetectionAnalysis{
+		{
+			Analysis:         ReplicationStopped,
+			AnalyzedKeyspace: "ks1",
+			AnalyzedShard:    "0",
+			TabletType:       topodatapb.TabletType_REPLICA,
+		},
+		{
+			Analysis:         DeadPrimary,
+			AnalyzedKeyspace: "ks1",
+			AnalyzedShard:    "0",
+			TabletType:       topodatapb.TabletType_PRIMARY,
+		},
+		{
+			Analysis:         PrimaryIsReadOnly,
+			AnalyzedKeyspace: "ks2",
+			AnalyzedShard:    "0",
+			TabletType:       topodatapb.TabletType_PRIMARY,
+		},
+	}
+
+	result := GroupDetectionAnalysesByShard(analyses)
+
+	require.Len(t, result, 2)
+
+	// ks1/0 should have 2 entries, sorted with DeadPrimary first
+	ks1 := result["ks1/0"]
+	require.Len(t, ks1, 2)
+	assert.Equal(t, DeadPrimary, ks1[0].Analysis)
+	assert.Equal(t, ReplicationStopped, ks1[1].Analysis)
+
+	// ks2/0 should have 1 entry
+	ks2 := result["ks2/0"]
+	require.Len(t, ks2, 1)
+	assert.Equal(t, PrimaryIsReadOnly, ks2[0].Analysis)
+}
diff --git a/go/vt/vtorc/inst/analysis_test.go b/go/vt/vtorc/inst/analysis_test.go
new file mode 100644
index 00000000000..60aaeb790d0
--- /dev/null
+++ b/go/vt/vtorc/inst/analysis_test.go
@@ -0,0 +1,70 @@
+/*
+Copyright 2026 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package inst + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + topodatapb "vitess.io/vitess/go/vt/proto/topodata" + "vitess.io/vitess/go/vt/vtctl/reparentutil/policy" +) + +func TestHasMinSemiSyncAckers(t *testing.T) { + durablerNone, _ := policy.GetDurabilityPolicy("none") + durablerCrossCell, _ := policy.GetDurabilityPolicy("cross_cell") + tablet := &topodatapb.Tablet{Keyspace: t.Name(), Shard: "-"} + + testCases := []struct { + name string + durabler policy.Durabler + analysis *DetectionAnalysis + expect bool + }{ + { + name: "durability policy none", + analysis: &DetectionAnalysis{ + CountValidSemiSyncReplicatingReplicas: 0, + }, + durabler: durablerNone, + expect: true, + }, + { + name: "durability policy cross_cell without min ackers", + durabler: durablerCrossCell, + analysis: &DetectionAnalysis{ + CountValidSemiSyncReplicatingReplicas: 0, + }, + expect: false, + }, + { + name: "durability policy cross_cell with min ackers", + durabler: durablerCrossCell, + analysis: &DetectionAnalysis{ + CountValidSemiSyncReplicatingReplicas: uint(durablerCrossCell.SemiSyncAckers(tablet)), + }, + expect: true, + }, + } + + for _, testCase := range testCases { + t.Run(testCase.name, func(t *testing.T) { + assert.Equal(t, testCase.expect, hasMinSemiSyncAckers(testCase.durabler, tablet, testCase.analysis)) + }) + } +} diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index b719c1453fc..0e8f73b43c5 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -21,7 +21,10 @@ import ( "encoding/json" "errors" "fmt" + "maps" "math/rand/v2" + "slices" + "sync" "sync/atomic" "time" @@ -968,6 +971,32 @@ func checkIfAlreadyFixed(analysisEntry *inst.DetectionAnalysis) (bool, error) { return true, nil } +// recoverShardAnalyses executes recoveries for a shard's analyses. 
Analyses +// that require ordered execution run sequentially first, then the remaining +// independent analyses fan out concurrently. +func recoverShardAnalyses(analyses []*inst.DetectionAnalysis, recoverFunc func(*inst.DetectionAnalysis) error) { + var concurrent []*inst.DetectionAnalysis + for _, analysisEntry := range analyses { + problem := inst.GetDetectionAnalysisProblem(analysisEntry.Analysis) + if problem != nil && problem.RequiresOrderedExecution() { + if err := recoverFunc(analysisEntry); err != nil { + log.Error(fmt.Sprintf("Failed to execute CheckAndRecover function: %+v", err)) + } + } else { + concurrent = append(concurrent, analysisEntry) + } + } + var wg sync.WaitGroup + for _, analysisEntry := range concurrent { + wg.Go(func() { + if err := recoverFunc(analysisEntry); err != nil { + log.Error(fmt.Sprintf("Failed to execute CheckAndRecover function: %+v", err)) + } + }) + } + wg.Wait() +} + // CheckAndRecover is the main entry point for the recovery mechanism func CheckAndRecover() { // Allow the analysis to run even if we don't want to recover @@ -977,22 +1006,26 @@ func CheckAndRecover() { return } + analysisByShard := inst.GroupDetectionAnalysesByShard(detectionAnalysis) + // Regardless of if the problem is solved or not we want to monitor active // issues, we use a map of labels and set a counter to `1` for each problem // then we reset any counter that is not present in the current analysis. active := make(map[string]struct{}) - for _, e := range detectionAnalysis { - if e.Analysis != inst.NoProblem { - names := [...]string{ - string(e.Analysis), - e.AnalyzedInstanceAlias, - e.AnalyzedKeyspace, - e.AnalyzedShard, + for _, shardAnalyses := range analysisByShard { + for _, e := range shardAnalyses { + if e.Analysis != inst.NoProblem { + names := [...]string{ + string(e.Analysis), + e.AnalyzedInstanceAlias, + e.AnalyzedKeyspace, + e.AnalyzedShard, + } + + key := detectedProblems.GetLabelName(names[:]...) 
+ active[key] = struct{}{} + detectedProblems.Set(names[:], 1) } - - key := detectedProblems.GetLabelName(names[:]...) - active[key] = struct{}{} - detectedProblems.Set(names[:], 1) } } @@ -1003,14 +1036,18 @@ func CheckAndRecover() { } } - // intentionally iterating entries in random order - for _, j := range rand.Perm(len(detectionAnalysis)) { - analysisEntry := detectionAnalysis[j] - + // Shuffle shard keys to ensure random processing order. Randomness helps reduce + // global shard lock contention when many VTOrcs watch the same shard(s). Within + // each shard, analyses are sorted by priority. Problems that require ordered + // execution (shard-wide actions or those with Before/After dependencies) run + // sequentially first, then independent problems fan out concurrently. + shardKeys := slices.Collect(maps.Keys(analysisByShard)) + rand.Shuffle(len(shardKeys), func(i, j int) { + shardKeys[i], shardKeys[j] = shardKeys[j], shardKeys[i] + }) + for _, key := range shardKeys { go func() { - if err := executeCheckAndRecoverFunction(analysisEntry); err != nil { - log.Error(err) - } + recoverShardAnalyses(analysisByShard[key], executeCheckAndRecoverFunction) }() } } diff --git a/go/vt/vtorc/logic/topology_recovery_test.go b/go/vt/vtorc/logic/topology_recovery_test.go index 0b81d320805..c22717bed17 100644 --- a/go/vt/vtorc/logic/topology_recovery_test.go +++ b/go/vt/vtorc/logic/topology_recovery_test.go @@ -18,6 +18,7 @@ package logic import ( "context" + "sync" "testing" "vitess.io/vitess/go/vt/log" @@ -524,5 +525,35 @@ func TestRecheckPrimaryHealth(t *testing.T) { require.NoError(t, err) }) } +} + +func TestRecoverShardAnalyses(t *testing.T) { + // DeadPrimary and PrimaryHasPrimary have detectionAnalysisPriorityShardWideAction, + // so they require ordered execution. ReplicationStopped and ReplicaIsWritable are + // medium priority with no shard-wide action or before/after dependencies, + // so they run concurrently. 
+ analyses := []*inst.DetectionAnalysis{ + {Analysis: inst.ReplicationStopped, AnalyzedInstanceAlias: "replica1"}, + {Analysis: inst.DeadPrimary, AnalyzedInstanceAlias: "primary1"}, + {Analysis: inst.ReplicaIsWritable, AnalyzedInstanceAlias: "replica2"}, + {Analysis: inst.PrimaryHasPrimary, AnalyzedInstanceAlias: "primary2"}, + } + + var mu sync.Mutex + var order []inst.AnalysisCode + recoverFunc := func(entry *inst.DetectionAnalysis) error { + mu.Lock() + defer mu.Unlock() + order = append(order, entry.Analysis) + return nil + } + + recoverShardAnalyses(analyses, recoverFunc) + require.Len(t, order, 4) + // Ordered recoveries must come first, in their original order. + require.Equal(t, inst.DeadPrimary, order[0]) + require.Equal(t, inst.PrimaryHasPrimary, order[1]) + // Concurrent recoveries come after, in any order. + require.ElementsMatch(t, []inst.AnalysisCode{inst.ReplicationStopped, inst.ReplicaIsWritable}, order[2:]) } diff --git a/go/vt/vtorc/test/recovery_analysis.go b/go/vt/vtorc/test/recovery_analysis.go index 64505659ca5..cb80494804f 100644 --- a/go/vt/vtorc/test/recovery_analysis.go +++ b/go/vt/vtorc/test/recovery_analysis.go @@ -18,6 +18,7 @@ package test import ( "fmt" + "strconv" "time" "google.golang.org/protobuf/encoding/prototext" @@ -74,6 +75,7 @@ type InfoForRecoveryAnalysis struct { CountMixedBasedLoggingReplicas uint CountRowBasedLoggingReplicas uint CountDistinctMajorVersionsLoggingReplicas uint + CountValidSemiSyncReplicatingReplicas uint CountDelayedReplicas uint CountLaggingReplicas uint MinReplicaGTIDMode string @@ -97,14 +99,15 @@ func (info *InfoForRecoveryAnalysis) ConvertToRowMap() sqlutils.RowMap { rowMap["count_logging_replicas"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.CountLoggingReplicas), Valid: true} rowMap["count_mixed_based_logging_replicas"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.CountMixedBasedLoggingReplicas), Valid: true} rowMap["count_oracle_gtid_replicas"] = sqlutils.CellData{Valid: false} - 
rowMap["count_replicas"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.CountReplicas), Valid: true} - rowMap["count_row_based_logging_replicas"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.CountRowBasedLoggingReplicas), Valid: true} - rowMap["count_semi_sync_replicas"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.CountSemiSyncReplicasEnabled), Valid: true} - rowMap["count_statement_based_logging_replicas"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.CountStatementBasedLoggingReplicas), Valid: true} - rowMap["count_valid_binlog_server_replicas"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.CountValidBinlogServerReplicas), Valid: true} - rowMap["count_valid_oracle_gtid_replicas"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.CountValidOracleGTIDReplicas), Valid: true} - rowMap["count_valid_replicas"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.CountValidReplicas), Valid: true} - rowMap["count_valid_replicating_replicas"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.CountValidReplicatingReplicas), Valid: true} + rowMap["count_replicas"] = sqlutils.CellData{String: strconv.FormatUint(uint64(info.CountReplicas), 10), Valid: true} + rowMap["count_row_based_logging_replicas"] = sqlutils.CellData{String: strconv.FormatUint(uint64(info.CountRowBasedLoggingReplicas), 10), Valid: true} + rowMap["count_semi_sync_replicas"] = sqlutils.CellData{String: strconv.FormatUint(uint64(info.CountSemiSyncReplicasEnabled), 10), Valid: true} + rowMap["count_statement_based_logging_replicas"] = sqlutils.CellData{String: strconv.FormatUint(uint64(info.CountStatementBasedLoggingReplicas), 10), Valid: true} + rowMap["count_valid_binlog_server_replicas"] = sqlutils.CellData{String: strconv.FormatUint(uint64(info.CountValidBinlogServerReplicas), 10), Valid: true} + rowMap["count_valid_oracle_gtid_replicas"] = sqlutils.CellData{String: strconv.FormatUint(uint64(info.CountValidOracleGTIDReplicas), 10), Valid: true} + rowMap["count_valid_replicas"] = 
sqlutils.CellData{String: strconv.FormatUint(uint64(info.CountValidReplicas), 10), Valid: true} + rowMap["count_valid_replicating_replicas"] = sqlutils.CellData{String: strconv.FormatUint(uint64(info.CountValidReplicatingReplicas), 10), Valid: true} + rowMap["count_valid_semi_sync_replicating_replicas"] = sqlutils.CellData{String: strconv.FormatUint(uint64(info.CountValidSemiSyncReplicatingReplicas), 10), Valid: true} rowMap["downtime_end_timestamp"] = sqlutils.CellData{String: info.DowntimeEndTimestamp, Valid: true} rowMap["downtime_remaining_seconds"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.DowntimeRemainingSeconds), Valid: true} rowMap["durability_policy"] = sqlutils.CellData{String: info.DurabilityPolicy, Valid: true}