Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 15 additions & 15 deletions go/vt/vtorc/inst/analysis_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,10 @@ func initializeAnalysisDaoPostConfiguration() {
}

type clusterAnalysis struct {
hasClusterwideAction bool
totalTablets int
primaryAlias string
durability policy.Durabler
hasShardWideAction bool
totalTablets int
primaryAlias string
durability policy.Durabler
}

// GetReplicationAnalysis will check for replication problems (dead primary; unreachable primary; etc)
Expand Down Expand Up @@ -396,8 +396,8 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
ca := clusters[keyspaceShard]
// Increment the total number of tablets.
ca.totalTablets += 1
if ca.hasClusterwideAction {
// We can only take one cluster level action at a time.
if ca.hasShardWideAction {
// We can only take one shard level action at a time.
return nil
}
if ca.durability == nil {
Expand All @@ -414,31 +414,31 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
} else if a.IsClusterPrimary && !a.LastCheckValid && a.IsDiskStalled {
a.Analysis = PrimaryDiskStalled
a.Description = "Primary has a stalled disk"
ca.hasClusterwideAction = true
ca.hasShardWideAction = true
} else if a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas == 0 {
a.Analysis = DeadPrimaryWithoutReplicas
a.Description = "Primary cannot be reached by vtorc and has no replica"
ca.hasClusterwideAction = true
ca.hasShardWideAction = true
//
} else if a.IsClusterPrimary && !a.LastCheckValid && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 {
a.Analysis = DeadPrimary
a.Description = "Primary cannot be reached by vtorc and none of its replicas is replicating"
ca.hasClusterwideAction = true
ca.hasShardWideAction = true
//
} else if a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == 0 && a.CountValidReplicatingReplicas == 0 {
a.Analysis = DeadPrimaryAndReplicas
a.Description = "Primary cannot be reached by vtorc and none of its replicas is replicating"
ca.hasClusterwideAction = true
ca.hasShardWideAction = true
//
} else if a.IsClusterPrimary && !a.LastCheckValid && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 {
a.Analysis = DeadPrimaryAndSomeReplicas
a.Description = "Primary cannot be reached by vtorc; some of its replicas are unreachable and none of its reachable replicas is replicating"
ca.hasClusterwideAction = true
ca.hasShardWideAction = true
//
} else if a.IsClusterPrimary && !a.IsPrimary {
a.Analysis = PrimaryHasPrimary
a.Description = "Primary is replicating from somewhere else"
ca.hasClusterwideAction = true
ca.hasShardWideAction = true
//
} else if a.IsClusterPrimary && a.IsReadOnly {
a.Analysis = PrimaryIsReadOnly
Expand All @@ -462,20 +462,20 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
// ClusterHasNoPrimary should only be detected when the shard record doesn't have any primary term start time specified either.
a.Analysis = ClusterHasNoPrimary
a.Description = "Cluster has no primary"
ca.hasClusterwideAction = true
ca.hasShardWideAction = true
} else if topo.IsReplicaType(a.TabletType) && ca.primaryAlias == "" && a.ShardPrimaryTermTimestamp != "" {
// If there are no primary tablets, but the shard primary start time isn't empty, then we know
// the primary tablet was deleted.
a.Analysis = PrimaryTabletDeleted
a.Description = "Primary tablet has been deleted"
ca.hasClusterwideAction = true
ca.hasShardWideAction = true
} else if a.IsPrimary && a.SemiSyncBlocked && a.CountSemiSyncReplicasEnabled >= a.SemiSyncPrimaryWaitForReplicaCount {
// The primary is reporting that semi-sync monitor is blocked on writes.
// There are enough replicas configured to send semi-sync ACKs such that the primary shouldn't be blocked.
// There is some network disruption in progress. We should run an ERS.
a.Analysis = PrimarySemiSyncBlocked
a.Description = "Writes seem to be blocked on semi-sync acks on the primary, even though sufficient replicas are configured to send ACKs"
ca.hasClusterwideAction = true
ca.hasShardWideAction = true
} else if topo.IsReplicaType(a.TabletType) && !a.IsReadOnly {
a.Analysis = ReplicaIsWritable
a.Description = "Replica is writable"
Expand Down
24 changes: 12 additions & 12 deletions go/vt/vtorc/logic/topology_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -485,8 +485,8 @@ func getRecoverFunctionName(recoveryFunctionCode recoveryFunction) string {
}
}

// isClusterWideRecovery returns whether the given recovery is a cluster-wide recovery or not
func isClusterWideRecovery(recoveryFunctionCode recoveryFunction) bool {
// isShardWideRecovery returns whether the given recovery is a recovery that affects all tablets in a shard
func isShardWideRecovery(recoveryFunctionCode recoveryFunction) bool {
switch recoveryFunctionCode {
case recoverDeadPrimaryFunc, electNewPrimaryFunc, recoverPrimaryTabletDeletedFunc:
return true
Expand Down Expand Up @@ -553,13 +553,13 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er

// Prioritise primary recovery.
// If we are performing some other action, first ensure that it is not because of primary issues.
// This step is only meant to improve the time taken to detect and fix cluster wide recoveries, it does not impact correctness.
// This step is only meant to improve the time taken to detect and fix shard-wide recoveries, it does not impact correctness.
// If a VTOrc detects an issue on a replica like ReplicationStopped, the underlying cause could be a dead primary instead.
// So, we try to reload that primary's information before proceeding with the replication stopped fix. We do this before acquiring the shard lock
// to allow another VTOrc instance to proceed with the dead primary recovery if it is indeed the case and it detects it before us. If however, the primary
// is not dead, then we will proceed with the fix for the replica. Essentially, we are trading off speed in replica recoveries (by doing an additional primary tablet reload)
// for speed in cluster-wide recoveries (by not holding the shard lock before reloading the primary tablet information).
if !isClusterWideRecovery(checkAndRecoverFunctionCode) {
// for speed in shard-wide recoveries (by not holding the shard lock before reloading the primary tablet information).
if !isShardWideRecovery(checkAndRecoverFunctionCode) {
if err = recheckPrimaryHealth(analysisEntry, DiscoverInstance); err != nil {
return err
}
Expand Down Expand Up @@ -591,10 +591,10 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
logger.Errorf("Failed to refresh keyspace and shard, aborting recovery: %v", err)
return err
}
// If we are about to run a cluster-wide recovery, it is imperative to first refresh all the tablets
// of a shard because a new tablet could have been promoted, and we need to have this visibility before we
// run a cluster operation of our own.
if isClusterWideRecovery(checkAndRecoverFunctionCode) {
// If we are about to run a shard-wide recovery, it is imperative to first refresh all the tablets
// of a shard because a new tablet could have been promoted, and we need to have this visibility
// before we run a shard-wide operation of our own.
if isShardWideRecovery(checkAndRecoverFunctionCode) {
var tabletsToIgnore []string
if checkAndRecoverFunctionCode == recoverDeadPrimaryFunc {
tabletsToIgnore = append(tabletsToIgnore, analysisEntry.AnalyzedInstanceAlias)
Expand All @@ -604,7 +604,7 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
logger.Info("Force refreshing all shard tablets")
forceRefreshAllTabletsInShard(ctx, analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard, tabletsToIgnore)
} else {
// If we are not running a cluster-wide recovery, then it is only concerned with the specific tablet
// If we are not running a shard-wide recovery, then it is only concerned with the specific tablet
// on which the failure occurred and the primary instance of the shard.
// For example, ConnectedToWrongPrimary analysis only cares for whom the current primary tablet is
// and the host-port set on the tablet in question.
Expand Down Expand Up @@ -668,11 +668,11 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
} else {
logger.Infof("Topology recovery: %+v", topologyRecovery)
}
// If we ran a cluster wide recovery and actually attempted it, then we know that the replication state for all the tablets in this cluster
// If we ran a shard-wide recovery and actually attempted it, then we know that the replication state for all the tablets in this shard
// would have changed. So we can go ahead and pre-emptively refresh them.
// For this refresh we don't use the same context that we used for the recovery, since that context might have expired or could expire soon
// Instead we pass the background context. The call forceRefreshAllTabletsInShard handles adding a timeout to it for us.
if isClusterWideRecovery(checkAndRecoverFunctionCode) {
if isShardWideRecovery(checkAndRecoverFunctionCode) {
logger.Info("Forcing refresh of all tablets post recovery")
forceRefreshAllTabletsInShard(context.Background(), analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard, nil)
} else {
Expand Down
Loading