diff --git a/go/vt/vtorc/inst/analysis_dao.go b/go/vt/vtorc/inst/analysis_dao.go index 0e480f74ee8..254b72dc916 100644 --- a/go/vt/vtorc/inst/analysis_dao.go +++ b/go/vt/vtorc/inst/analysis_dao.go @@ -51,10 +51,10 @@ func initializeAnalysisDaoPostConfiguration() { } type clusterAnalysis struct { - hasClusterwideAction bool - totalTablets int - primaryAlias string - durability policy.Durabler + hasShardWideAction bool + totalTablets int + primaryAlias string + durability policy.Durabler } // GetReplicationAnalysis will check for replication problems (dead primary; unreachable primary; etc) @@ -396,8 +396,8 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna ca := clusters[keyspaceShard] // Increment the total number of tablets. ca.totalTablets += 1 - if ca.hasClusterwideAction { - // We can only take one cluster level action at a time. + if ca.hasShardWideAction { + // We can only take one shard level action at a time. return nil } if ca.durability == nil { @@ -414,31 +414,31 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna } else if a.IsClusterPrimary && !a.LastCheckValid && a.IsDiskStalled { a.Analysis = PrimaryDiskStalled a.Description = "Primary has a stalled disk" - ca.hasClusterwideAction = true + ca.hasShardWideAction = true } else if a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas == 0 { a.Analysis = DeadPrimaryWithoutReplicas a.Description = "Primary cannot be reached by vtorc and has no replica" - ca.hasClusterwideAction = true + ca.hasShardWideAction = true // } else if a.IsClusterPrimary && !a.LastCheckValid && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { a.Analysis = DeadPrimary a.Description = "Primary cannot be reached by vtorc and none of its replicas is replicating" - ca.hasClusterwideAction = true + ca.hasShardWideAction = true // } else if a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == 0 && 
a.CountValidReplicatingReplicas == 0 { a.Analysis = DeadPrimaryAndReplicas a.Description = "Primary cannot be reached by vtorc and none of its replicas is replicating" - ca.hasClusterwideAction = true + ca.hasShardWideAction = true // } else if a.IsClusterPrimary && !a.LastCheckValid && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 { a.Analysis = DeadPrimaryAndSomeReplicas a.Description = "Primary cannot be reached by vtorc; some of its replicas are unreachable and none of its reachable replicas is replicating" - ca.hasClusterwideAction = true + ca.hasShardWideAction = true // } else if a.IsClusterPrimary && !a.IsPrimary { a.Analysis = PrimaryHasPrimary a.Description = "Primary is replicating from somewhere else" - ca.hasClusterwideAction = true + ca.hasShardWideAction = true // } else if a.IsClusterPrimary && a.IsReadOnly { a.Analysis = PrimaryIsReadOnly @@ -462,20 +462,20 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna // ClusterHasNoPrimary should only be detected when the shard record doesn't have any primary term start time specified either. a.Analysis = ClusterHasNoPrimary a.Description = "Cluster has no primary" - ca.hasClusterwideAction = true + ca.hasShardWideAction = true } else if topo.IsReplicaType(a.TabletType) && ca.primaryAlias == "" && a.ShardPrimaryTermTimestamp != "" { // If there are no primary tablets, but the shard primary start time isn't empty, then we know // the primary tablet was deleted. a.Analysis = PrimaryTabletDeleted a.Description = "Primary tablet has been deleted" - ca.hasClusterwideAction = true + ca.hasShardWideAction = true } else if a.IsPrimary && a.SemiSyncBlocked && a.CountSemiSyncReplicasEnabled >= a.SemiSyncPrimaryWaitForReplicaCount { // The primary is reporting that semi-sync monitor is blocked on writes. // There are enough replicas configured to send semi-sync ACKs such that the primary shouldn't be blocked. 
// There is some network diruption in progress. We should run an ERS. a.Analysis = PrimarySemiSyncBlocked a.Description = "Writes seem to be blocked on semi-sync acks on the primary, even though sufficient replicas are configured to send ACKs" - ca.hasClusterwideAction = true + ca.hasShardWideAction = true } else if topo.IsReplicaType(a.TabletType) && !a.IsReadOnly { a.Analysis = ReplicaIsWritable a.Description = "Replica is writable" diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index 25b79e1f1bc..c8874acdf00 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -485,8 +485,8 @@ func getRecoverFunctionName(recoveryFunctionCode recoveryFunction) string { } } -// isClusterWideRecovery returns whether the given recovery is a cluster-wide recovery or not -func isClusterWideRecovery(recoveryFunctionCode recoveryFunction) bool { +// isShardWideRecovery returns whether the given recovery is a recovery that affects all tablets in a shard +func isShardWideRecovery(recoveryFunctionCode recoveryFunction) bool { switch recoveryFunctionCode { case recoverDeadPrimaryFunc, electNewPrimaryFunc, recoverPrimaryTabletDeletedFunc: return true @@ -553,13 +553,13 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er // Prioritise primary recovery. // If we are performing some other action, first ensure that it is not because of primary issues. - // This step is only meant to improve the time taken to detect and fix cluster wide recoveries, it does not impact correctness. + // This step is only meant to improve the time taken to detect and fix shard-wide recoveries, it does not impact correctness. // If a VTOrc detects an issue on a replica like ReplicationStopped, the underlying cause could be a dead primary instead. // So, we try to reload that primary's information before proceeding with the replication stopped fix. 
We do this before acquiring the shard lock // to allow another VTOrc instance to proceed with the dead primary recovery if it is indeed the case and it detects it before us. If however, the primary // is not dead, then we will proceed with the fix for the replica. Essentially, we are trading off speed in replica recoveries (by doing an additional primary tablet reload) - // for speed in cluster-wide recoveries (by not holding the shard lock before reloading the primary tablet information). - if !isClusterWideRecovery(checkAndRecoverFunctionCode) { + // for speed in shard-wide recoveries (by not holding the shard lock before reloading the primary tablet information). + if !isShardWideRecovery(checkAndRecoverFunctionCode) { if err = recheckPrimaryHealth(analysisEntry, DiscoverInstance); err != nil { return err } @@ -591,10 +591,10 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er logger.Errorf("Failed to refresh keyspace and shard, aborting recovery: %v", err) return err } - // If we are about to run a cluster-wide recovery, it is imperative to first refresh all the tablets - // of a shard because a new tablet could have been promoted, and we need to have this visibility before we - // run a cluster operation of our own. - if isClusterWideRecovery(checkAndRecoverFunctionCode) { + // If we are about to run a shard-wide recovery, it is imperative to first refresh all the tablets + // of a shard because a new tablet could have been promoted, and we need to have this visibility + // before we run a shard-wide operation of our own. 
+ if isShardWideRecovery(checkAndRecoverFunctionCode) { var tabletsToIgnore []string if checkAndRecoverFunctionCode == recoverDeadPrimaryFunc { tabletsToIgnore = append(tabletsToIgnore, analysisEntry.AnalyzedInstanceAlias) } @@ -604,7 +604,7 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er logger.Info("Force refreshing all shard tablets") forceRefreshAllTabletsInShard(ctx, analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard, tabletsToIgnore) } else { - // If we are not running a cluster-wide recovery, then it is only concerned with the specific tablet + // If we are not running a shard-wide recovery, then it is only concerned with the specific tablet // on which the failure occurred and the primary instance of the shard. // For example, ConnectedToWrongPrimary analysis only cares for whom the current primary tablet is // and the host-port set on the tablet in question. @@ -668,11 +668,11 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er } else { logger.Infof("Topology recovery: %+v", topologyRecovery) } - // If we ran a cluster wide recovery and actually attempted it, then we know that the replication state for all the tablets in this cluster + // If we ran a shard-wide recovery and actually attempted it, then we know that the replication state for all the tablets in this shard // would have changed. So we can go ahead and pre-emptively refresh them. // For this refresh we don't use the same context that we used for the recovery, since that context might have expired or could expire soon // Instead we pass the background context. The call forceRefreshAllTabletsInShard handles adding a timeout to it for us. 
- if isClusterWideRecovery(checkAndRecoverFunctionCode) { + if isShardWideRecovery(checkAndRecoverFunctionCode) { logger.Info("Forcing refresh of all tablets post recovery") forceRefreshAllTabletsInShard(context.Background(), analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard, nil) } else {