Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
71131d5
`vtorc`: add problem ordering by shard, fix semi-sync deploy
timvaillancourt Feb 18, 2026
b0700ef
better comment
timvaillancourt Feb 19, 2026
dcce800
update test
timvaillancourt Feb 19, 2026
04b4399
use detectionAnalysisPriorityCritical
timvaillancourt Feb 19, 2026
156f49f
redundant tests
timvaillancourt Feb 19, 2026
50c6a00
move problems to slice instead
timvaillancourt Feb 19, 2026
713c65b
improve comment
timvaillancourt Feb 19, 2026
5092ca9
improve log
timvaillancourt Feb 22, 2026
51f4b25
Merge remote-tracking branch 'origin/main' into vtorc-analysis-refactor
timvaillancourt Feb 23, 2026
9bb879c
validate the ordered/concurrent recovery sorting
timvaillancourt Feb 23, 2026
2f3b04c
Merge remote-tracking branch 'origin/main' into vtorc-analysis-refactor
timvaillancourt Feb 23, 2026
4764d65
rm `HasShardWideAction bool`, just make it a priority int
timvaillancourt Feb 24, 2026
6d80f20
fix comments
timvaillancourt Feb 24, 2026
9ddeff9
shuffle map keys in CheckAndRecover()
timvaillancourt Feb 24, 2026
9f48f53
add code comments
timvaillancourt Feb 24, 2026
91d3264
use `sync.WaitGroup` in `CheckAndRecover()`
timvaillancourt Feb 24, 2026
78b2a36
Merge remote-tracking branch 'origin/main' into vtorc-analysis-refactor
timvaillancourt Feb 24, 2026
52cbe38
fix lint
timvaillancourt Feb 24, 2026
fc092a3
Add `a.IsClusterPrimary` to `PrimarySemiSyncBlocked`
timvaillancourt Feb 24, 2026
be8a291
rm defer
timvaillancourt Feb 24, 2026
25751e6
Revert "use `sync.WaitGroup` in `CheckAndRecover()`"
timvaillancourt Feb 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions go/test/endtoend/vtorc/general/vtorc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ package general

import (
"context"
"encoding/json"
"fmt"
"strconv"
"strings"
"testing"
"time"
Expand Down Expand Up @@ -884,3 +886,85 @@ func TestFullStatusConnectionPooling(t *testing.T) {
return err == nil && status == 200 && strings.TrimSpace(resp) == "null"
}, 90*time.Second, time.Second, "timed out waiting for replication analysis to clear")
}

// TestSemiSyncRecoveryOrdering verifies that when the durability policy changes
// to semi_sync, VTOrc fixes ReplicaSemiSyncMustBeSet before PrimarySemiSyncMustBeSet.
// This ordering is enforced by the AfterAnalyses/BeforeAnalyses dependencies.
func TestSemiSyncRecoveryOrdering(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
// Start with durability "none" so no semi-sync is required initially.
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 0, nil, cluster.VTOrcConfiguration{
PreventCrossCellFailover: true,
}, cluster.DefaultVtorcsByCell, policy.DurabilityNone)
keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
shard0 := &keyspace.Shards[0]

// Wait for primary election and healthy replication.
primary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
assert.NotNil(t, primary, "should have elected a primary")
utils.CheckReplication(t, clusterInfo, primary, shard0.Vttablets, 10*time.Second)

vtorc := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtorc, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1)

// Change durability to semi_sync. VTOrc should detect that replicas and primary
// need semi-sync enabled, and fix them in the correct order.
out, err := clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput(
"SetKeyspaceDurabilityPolicy", keyspace.Name, "--durability-policy="+policy.DurabilitySemiSync)
require.NoError(t, err, out)

// Poll the database-state API to verify recovery ordering.
// The topology_recovery table has auto-incremented recovery_id values that
// reflect execution order. All ReplicaSemiSyncMustBeSet recovery_ids should
// be less than any PrimarySemiSyncMustBeSet recovery_id.
type tableState struct {
TableName string
Rows []map[string]any
}

assert.EventuallyWithT(t, func(c *assert.CollectT) {
status, response, err := utils.MakeAPICall(t, vtorc, "/api/database-state")
assert.NoError(c, err)
assert.Equal(c, 200, status)

var tables []tableState
if !assert.NoError(c, json.Unmarshal([]byte(response), &tables)) {
return
}

var maxReplicaRecoveryID, minPrimaryRecoveryID int
var replicaCount, primaryCount int
for _, table := range tables {
if table.TableName != "topology_recovery" {
continue
}
for _, row := range table.Rows {
analysis, _ := row["analysis"].(string)
recoveryIDStr, _ := row["recovery_id"].(string)
recoveryID, err := strconv.Atoi(recoveryIDStr)
if err != nil {
continue
}
switch inst.AnalysisCode(analysis) {
case inst.ReplicaSemiSyncMustBeSet:
replicaCount++
if replicaCount == 1 || recoveryID > maxReplicaRecoveryID {
maxReplicaRecoveryID = recoveryID
}
case inst.PrimarySemiSyncMustBeSet:
primaryCount++
if primaryCount == 1 || recoveryID < minPrimaryRecoveryID {
minPrimaryRecoveryID = recoveryID
}
}
}
}

assert.Greater(c, replicaCount, 0, "should have ReplicaSemiSyncMustBeSet recoveries")
assert.Greater(c, primaryCount, 0, "should have PrimarySemiSyncMustBeSet recoveries")
if replicaCount > 0 && primaryCount > 0 {
assert.Less(c, maxReplicaRecoveryID, minPrimaryRecoveryID,
"all ReplicaSemiSyncMustBeSet recoveries should have lower recovery_id than PrimarySemiSyncMustBeSet")
}
}, 30*time.Second, time.Second)
}
13 changes: 13 additions & 0 deletions go/vt/vtorc/inst/analysis.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"time"

topodatapb "vitess.io/vitess/go/vt/proto/topodata"
"vitess.io/vitess/go/vt/vtctl/reparentutil/policy"
"vitess.io/vitess/go/vt/vtorc/config"
)

Expand Down Expand Up @@ -117,11 +118,13 @@ type DetectionAnalysis struct {
CountReplicas uint
CountValidReplicas uint
CountValidReplicatingReplicas uint
CountValidSemiSyncReplicatingReplicas uint
ReplicationStopped bool
ErrantGTID string
ReplicaNetTimeout int32
HeartbeatInterval float64
Analysis AnalysisCode
AnalysisMatchedProblems []*DetectionAnalysisProblemMeta
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here we store ALL problems a tablet matched, in priority order

We only pick the top-priority problem for the tablet, but storing all matches is useful for debugging purposes

Description string
StructureAnalysis []StructureAnalysisCode
OracleGTIDImmediateTopology bool
Expand Down Expand Up @@ -150,6 +153,16 @@ type DetectionAnalysis struct {
IsDiskStalled bool
}

// hasMinSemiSyncAckers returns true if there are a minimum number of semi-sync ackers enabled and replicating.
// True is always returned if the durability policy does not require semi-sync ackers (eg: "none"). This gives
// a useful signal if it is safe to enable semi-sync without risk of stalling ongoing PRIMARY writes.
func hasMinSemiSyncAckers(durabler policy.Durabler, primary *topodatapb.Tablet, analysis *DetectionAnalysis) bool {
if durabler == nil || analysis == nil {
return false
}
return int(analysis.CountValidSemiSyncReplicatingReplicas) >= durabler.SemiSyncAckers(primary)
}

func (detectionAnalysis *DetectionAnalysis) MarshalJSON() ([]byte, error) {
i := struct {
DetectionAnalysis
Expand Down
186 changes: 34 additions & 152 deletions go/vt/vtorc/inst/analysis_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ package inst

import (
"fmt"
"math"
"time"

"github.com/patrickmn/go-cache"
Expand Down Expand Up @@ -196,6 +195,15 @@ func GetDetectionAnalysis(keyspace string, shard string, hints *DetectionAnalysi
),
0
) AS count_valid_semi_sync_replicas,
IFNULL(
SUM(
replica_instance.last_checked <= replica_instance.last_seen
AND replica_instance.replica_io_running != 0
AND replica_instance.replica_sql_running != 0
AND replica_instance.semi_sync_replica_enabled != 0
),
0
) AS count_valid_semi_sync_replicating_replicas,
IFNULL(
SUM(
replica_instance.log_bin
Expand Down Expand Up @@ -351,6 +359,7 @@ func GetDetectionAnalysis(keyspace string, shard string, hints *DetectionAnalysi
a.SemiSyncBlocked = m.GetBool("semi_sync_blocked")
a.SemiSyncReplicaEnabled = m.GetBool("semi_sync_replica_enabled")
a.CountSemiSyncReplicasEnabled = m.GetUint("count_semi_sync_replicas")
a.CountValidSemiSyncReplicatingReplicas = m.GetUint("count_valid_semi_sync_replicating_replicas")
// countValidSemiSyncReplicasEnabled := m.GetUint("count_valid_semi_sync_replicas")
a.SemiSyncPrimaryWaitForReplicaCount = m.GetUint("semi_sync_primary_wait_for_replica_count")
a.SemiSyncPrimaryClients = m.GetUint("semi_sync_primary_clients")
Expand Down Expand Up @@ -406,165 +415,38 @@ func GetDetectionAnalysis(keyspace string, shard string, hints *DetectionAnalysi
// Increment the total number of tablets.
ca.totalTablets += 1
if ca.hasShardWideAction {
// We can only take one shard level action at a time.
// We can only take one shard-wide action at a time.
return nil
}
if ca.durability == nil {
// We failed to load the durability policy, so we shouldn't run any analysis
return nil
}
isInvalid := m.GetBool("is_invalid")
switch {
case a.IsClusterPrimary && isInvalid:
a.Analysis = InvalidPrimary
a.Description = "VTOrc hasn't been able to reach the primary even once since restart/shutdown"
case isInvalid:
a.Analysis = InvalidReplica
a.Description = "VTOrc hasn't been able to reach the replica even once since restart/shutdown"
case a.IsClusterPrimary && !a.LastCheckValid && a.IsDiskStalled:
a.Analysis = PrimaryDiskStalled
a.Description = "Primary has a stalled disk"
ca.hasShardWideAction = true
case a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas == 0:
a.Analysis = DeadPrimaryWithoutReplicas
a.Description = "Primary cannot be reached by vtorc and has no replica"
ca.hasShardWideAction = true
//
case a.IsClusterPrimary && !a.LastCheckValid && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0:
a.Analysis = DeadPrimary
a.Description = "Primary cannot be reached by vtorc and none of its replicas is replicating"
ca.hasShardWideAction = true
//
case a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == 0 && a.CountValidReplicatingReplicas == 0:
a.Analysis = DeadPrimaryAndReplicas
a.Description = "Primary cannot be reached by vtorc and none of its replicas is replicating"
ca.hasShardWideAction = true
//
case a.IsClusterPrimary && !a.LastCheckValid && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0:
a.Analysis = DeadPrimaryAndSomeReplicas
a.Description = "Primary cannot be reached by vtorc; some of its replicas are unreachable and none of its reachable replicas is replicating"
ca.hasShardWideAction = true
//
case a.IsClusterPrimary && !a.LastCheckValid && a.PrimaryHealthUnhealthy:
a.Analysis = IncapacitatedPrimary
a.Description = "Primary is consistently timing out on health checks and may be incapacitated"
ca.hasShardWideAction = true
//
case a.IsClusterPrimary && !a.IsPrimary:
a.Analysis = PrimaryHasPrimary
a.Description = "Primary is replicating from somewhere else"
ca.hasShardWideAction = true
//
case a.IsClusterPrimary && a.IsReadOnly:
a.Analysis = PrimaryIsReadOnly
a.Description = "Primary is read-only"
//
case a.IsClusterPrimary && policy.SemiSyncAckers(ca.durability, tablet) != 0 && !a.SemiSyncPrimaryEnabled:
a.Analysis = PrimarySemiSyncMustBeSet
a.Description = "Primary semi-sync must be set"
//
case a.IsClusterPrimary && policy.SemiSyncAckers(ca.durability, tablet) == 0 && a.SemiSyncPrimaryEnabled:
a.Analysis = PrimarySemiSyncMustNotBeSet
a.Description = "Primary semi-sync must not be set"
//
case a.IsClusterPrimary && a.CurrentTabletType != topodatapb.TabletType_UNKNOWN && a.CurrentTabletType != topodatapb.TabletType_PRIMARY:
a.Analysis = PrimaryCurrentTypeMismatch
a.Description = "Primary tablet's current type is not PRIMARY"
case isStaleTopoPrimary(a, ca):
a.Analysis = StaleTopoPrimary
a.Description = "Primary tablet is stale, older than current primary"
case topo.IsReplicaType(a.TabletType) && a.ErrantGTID != "":
a.Analysis = ErrantGTIDDetected
a.Description = "Tablet has errant GTIDs"
case topo.IsReplicaType(a.TabletType) && ca.primaryAlias == "" && a.ShardPrimaryTermTimestamp.IsZero():
// ClusterHasNoPrimary should only be detected when the shard record doesn't have any primary term start time specified either.
a.Analysis = ClusterHasNoPrimary
a.Description = "Cluster has no primary"
ca.hasShardWideAction = true
case topo.IsReplicaType(a.TabletType) && ca.primaryAlias == "" && !a.ShardPrimaryTermTimestamp.IsZero():
// If there are no primary tablets, but the shard primary start time isn't empty, then we know
// the primary tablet was deleted.
a.Analysis = PrimaryTabletDeleted
a.Description = "Primary tablet has been deleted"
ca.hasShardWideAction = true
case topo.IsReplicaType(a.TabletType) && a.IsPrimary:
a.Analysis = NotConnectedToPrimary
a.Description = "Not connected to the primary"
//
case a.IsPrimary && a.SemiSyncBlocked && a.CountSemiSyncReplicasEnabled >= a.SemiSyncPrimaryWaitForReplicaCount:
// The primary is reporting that semi-sync monitor is blocked on writes.
// There are enough replicas configured to send semi-sync ACKs such that the primary shouldn't be blocked.
// There is some network diruption in progress. We should run an ERS.
a.Analysis = PrimarySemiSyncBlocked
a.Description = "Writes seem to be blocked on semi-sync acks on the primary, even though sufficient replicas are configured to send ACKs"
ca.hasShardWideAction = true
case topo.IsReplicaType(a.TabletType) && !a.IsReadOnly:
a.Analysis = ReplicaIsWritable
a.Description = "Replica is writable"
//
case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && math.Round(a.HeartbeatInterval*2) != float64(a.ReplicaNetTimeout):
a.Analysis = ReplicaMisconfigured
a.Description = "Replica has been misconfigured"
//
case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && ca.primaryAlias != "" && a.AnalyzedInstancePrimaryAlias != ca.primaryAlias:
a.Analysis = ConnectedToWrongPrimary
a.Description = "Connected to wrong primary"
//
case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && a.ReplicationStopped:
a.Analysis = ReplicationStopped
a.Description = "Replication is stopped"
//
case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && policy.IsReplicaSemiSync(ca.durability, primaryTablet, tablet) && !a.SemiSyncReplicaEnabled:
a.Analysis = ReplicaSemiSyncMustBeSet
a.Description = "Replica semi-sync must be set"
//
case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && !policy.IsReplicaSemiSync(ca.durability, primaryTablet, tablet) && a.SemiSyncReplicaEnabled:
a.Analysis = ReplicaSemiSyncMustNotBeSet
a.Description = "Replica semi-sync must not be set"
//
// TODO(sougou): Events below here are either ignored or not possible.
case a.IsPrimary && !a.LastCheckValid && a.CountLaggingReplicas == a.CountReplicas && a.CountDelayedReplicas < a.CountReplicas && a.CountValidReplicatingReplicas > 0:
a.Analysis = UnreachablePrimaryWithLaggingReplicas
a.Description = "Primary cannot be reached by vtorc and all of its replicas are lagging"
//
case a.IsPrimary && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == a.CountValidReplicas:
// partial success is here to reduce noise
a.Analysis = UnreachablePrimary
a.Description = "Primary cannot be reached by vtorc but all of its replicas seem to be replicating; possibly a network/host issue"
//
case a.IsPrimary && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 && a.CountValidReplicatingReplicas < a.CountValidReplicas:
// partial success is here to reduce noise
a.Analysis = UnreachablePrimaryWithBrokenReplicas
a.Description = "Primary cannot be reached by vtorc but it has (some, but not all) replicating replicas; possibly a network/host issue"
//
case a.IsPrimary && a.SemiSyncPrimaryEnabled && a.SemiSyncPrimaryStatus && a.SemiSyncPrimaryWaitForReplicaCount > 0 && a.SemiSyncPrimaryClients < a.SemiSyncPrimaryWaitForReplicaCount:
if isStaleBinlogCoordinates {
a.Analysis = LockedSemiSyncPrimary
a.Description = "Semi sync primary is locked since it doesn't get enough replica acknowledgements"
} else {
a.Analysis = LockedSemiSyncPrimaryHypothesis
a.Description = "Semi sync primary seems to be locked, more samplings needed to validate"
var matchedProblems []*DetectionAnalysisProblem
for _, problem := range detectionAnalysisProblems {
// When isInvalid is true, instance data is unreliable (never been reached).
// Only InvalidPrimary/InvalidReplica should match; postProcessAnalyses
// handles upgrading InvalidPrimary to DeadPrimary if needed.
if isInvalid && problem.Meta.Analysis != InvalidPrimary && problem.Meta.Analysis != InvalidReplica {
continue
}
if problem.HasMatch(a, ca, primaryTablet, tablet, isInvalid, isStaleBinlogCoordinates) {
matchedProblems = append(matchedProblems, problem)
}
}
if len(matchedProblems) > 0 {
sortDetectionAnalysisMatchedProblems(matchedProblems)
for _, problem := range matchedProblems {
a.AnalysisMatchedProblems = append(a.AnalysisMatchedProblems, problem.Meta)
}
//
case a.IsPrimary && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0:
a.Analysis = PrimarySingleReplicaNotReplicating
a.Description = "Primary is reachable but its single replica is not replicating"
case a.IsPrimary && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == 0:
a.Analysis = PrimarySingleReplicaDead
a.Description = "Primary is reachable but its single replica is dead"
//
case a.IsPrimary && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0:
a.Analysis = AllPrimaryReplicasNotReplicating
a.Description = "Primary is reachable but none of its replicas is replicating"
//
case a.IsPrimary && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0:
a.Analysis = AllPrimaryReplicasNotReplicatingOrDead
a.Description = "Primary is reachable but none of its replicas is replicating"
//
// case a.IsPrimary && a.CountReplicas == 0:
// a.Analysis = PrimaryWithoutReplicas
// a.Description = "Primary has no replicas"
// }
// We return a single problem per tablet. Any remaining problems will be discovered/recovered
// by VTOrc(s) on future polls. Often many problems are resolved by a single recovery of the
// first problem. The first element of matchedProblems is the highest-priority problem.
chosenProblem := matchedProblems[0]
a.Analysis = chosenProblem.Meta.Analysis
a.Description = chosenProblem.Meta.Description
ca.hasShardWideAction = chosenProblem.Meta.Priority == detectionAnalysisPriorityShardWideAction
}

{
Expand Down
Loading
Loading