diff --git a/changelog/18.0/18.0.0/summary.md b/changelog/18.0/18.0.0/summary.md new file mode 100644 index 00000000000..3a8c3a17efd --- /dev/null +++ b/changelog/18.0/18.0.0/summary.md @@ -0,0 +1,24 @@ +## Summary + +### Table of Contents + +- **[Major Changes](#major-changes)** + - **[Breaking Changes](#breaking-changes)** + - **[New command line flags and behavior](#new-flag)** + - [VTOrc flag `--allow-emergency-reparent`](#new-flag-toggle-ers) + - **[Deprecations and Deletions](#deprecations-and-deletions)** + + +## Major Changes + +### Breaking Changes + +### <a id="new-flag"/>New command line flags and behavior + +#### <a id="new-flag-toggle-ers"/>VTOrc flag `--allow-emergency-reparent` + +VTOrc has a new flag `--allow-emergency-reparent` that lets users toggle VTOrc's ability to run emergency reparent operations. +Users who want VTOrc to fix replication issues, but don't want it to run any reparents, should start using this flag. +By default, VTOrc is able to run `EmergencyReparentShard`. Users must set the flag to `false` to change this behaviour. + +### Deprecations and Deletions diff --git a/changelog/18.0/README.md b/changelog/18.0/README.md new file mode 100644 index 00000000000..c12db42b0b6 --- /dev/null +++ b/changelog/18.0/README.md @@ -0,0 +1 @@ +## v18.0 diff --git a/go/flags/endtoend/vtorc.txt b/go/flags/endtoend/vtorc.txt index 3f184b7e8d5..bcf0f37bf66 100644 --- a/go/flags/endtoend/vtorc.txt +++ b/go/flags/endtoend/vtorc.txt @@ -1,4 +1,5 @@ Usage of vtorc: + --allow-emergency-reparent Whether VTOrc should be allowed to run emergency reparent operation when it detects a dead primary (default true) --alsologtostderr log to standard error as well as files --audit-file-location string File location where the audit logs are to be stored --audit-purge-duration duration Duration for which audit logs are held before being purged. 
Should be in multiples of days (default 168h0m0s) diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index 3d3dde96034..73e55378bec 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -67,6 +67,7 @@ var ( waitReplicasTimeout = 30 * time.Second topoInformationRefreshDuration = 15 * time.Second recoveryPollDuration = 1 * time.Second + ersEnabled = true ) // RegisterFlags registers the flags required by VTOrc @@ -86,6 +87,7 @@ func RegisterFlags(fs *pflag.FlagSet) { fs.DurationVar(&waitReplicasTimeout, "wait-replicas-timeout", waitReplicasTimeout, "Duration for which to wait for replica's to respond when issuing RPCs") fs.DurationVar(&topoInformationRefreshDuration, "topo-information-refresh-duration", topoInformationRefreshDuration, "Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topology server") fs.DurationVar(&recoveryPollDuration, "recovery-poll-duration", recoveryPollDuration, "Timer duration on which VTOrc polls its database to run a recovery") + fs.BoolVar(&ersEnabled, "allow-emergency-reparent", ersEnabled, "Whether VTOrc should be allowed to run emergency reparent operation when it detects a dead primary") } // Configuration makes for vtorc configuration input, which can be provided by user via JSON formatted file. @@ -137,6 +139,16 @@ func UpdateConfigValuesFromFlags() { Config.RecoveryPollSeconds = int(recoveryPollDuration / time.Second) } +// ERSEnabled reports whether VTOrc is allowed to run emergency reparent (ERS) operations. It returns the package-level ersEnabled value, which defaults to true and is bound to the --allow-emergency-reparent flag in RegisterFlags. +func ERSEnabled() bool { + return ersEnabled +} + +// SetERSEnabled overrides the value returned by ERSEnabled. It is a plain assignment to the package-level ersEnabled variable and should only be used from tests. +func SetERSEnabled(val bool) { + ersEnabled = val +} + // LogConfigValues is used to log the config values. 
func LogConfigValues() { b, _ := json.MarshalIndent(Config, "", "\t") diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index 3b2734bcda0..0d1e07f53cb 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -433,6 +433,10 @@ func getCheckAndRecoverFunctionCode(analysisCode inst.AnalysisCode, analyzedInst switch analysisCode { // primary case inst.DeadPrimary, inst.DeadPrimaryAndSomeReplicas: + // If ERS is disabled, we have no way of repairing the cluster. + if !config.ERSEnabled() { + return noRecoveryFunc + } if isInEmergencyOperationGracefulPeriod(analyzedInstanceKey) { return recoverGenericProblemFunc } diff --git a/go/vt/vtorc/logic/topology_recovery_test.go b/go/vt/vtorc/logic/topology_recovery_test.go index 211d8ff3467..2945a796fcc 100644 --- a/go/vt/vtorc/logic/topology_recovery_test.go +++ b/go/vt/vtorc/logic/topology_recovery_test.go @@ -26,10 +26,9 @@ import ( topodatapb "vitess.io/vitess/go/vt/proto/topodata" "vitess.io/vitess/go/vt/topo/memorytopo" + "vitess.io/vitess/go/vt/vtorc/config" "vitess.io/vitess/go/vt/vtorc/db" "vitess.io/vitess/go/vt/vtorc/inst" - - // import the gRPC client implementation for tablet manager _ "vitess.io/vitess/go/vt/vttablet/grpctmclient" ) @@ -190,3 +189,70 @@ func TestDifferentAnalysescHaveDifferentCooldowns(t *testing.T) { _, err = AttemptRecoveryRegistration(&primaryAnalysisEntry, true, true) require.Nil(t, err) } + +func TestGetCheckAndRecoverFunctionCode(t *testing.T) { + tests := []struct { + name string + ersEnabled bool + analysisCode inst.AnalysisCode + analyzedInstanceKey *inst.InstanceKey + wantRecoveryFunction recoveryFunction + }{ + { + name: "DeadPrimary with ERS enabled", + ersEnabled: true, + analysisCode: inst.DeadPrimary, + analyzedInstanceKey: &inst.InstanceKey{ + Hostname: hostname, + Port: 1, + }, + wantRecoveryFunction: recoverDeadPrimaryFunc, + }, { + name: "DeadPrimary with ERS disabled", + ersEnabled: false, 
+ analysisCode: inst.DeadPrimary, + analyzedInstanceKey: &inst.InstanceKey{ + Hostname: hostname, + Port: 1, + }, + wantRecoveryFunction: noRecoveryFunc, + }, { + name: "PrimaryHasPrimary", + ersEnabled: false, + analysisCode: inst.PrimaryHasPrimary, + wantRecoveryFunction: recoverPrimaryHasPrimaryFunc, + }, { + name: "ClusterHasNoPrimary", + ersEnabled: false, + analysisCode: inst.ClusterHasNoPrimary, + wantRecoveryFunction: electNewPrimaryFunc, + }, { + name: "ReplicationStopped", + ersEnabled: false, + analysisCode: inst.ReplicationStopped, + wantRecoveryFunction: fixReplicaFunc, + }, { + name: "PrimarySemiSyncMustBeSet", + ersEnabled: false, + analysisCode: inst.PrimarySemiSyncMustBeSet, + wantRecoveryFunction: fixPrimaryFunc, + }, + } + + // Swap in a fresh emergencyOperationGracefulPeriodMap for the duration of the test and restore the original afterwards; presumably the DeadPrimary path reads it via isInEmergencyOperationGracefulPeriod (TODO confirm). + oldMap := emergencyOperationGracefulPeriodMap + emergencyOperationGracefulPeriodMap = cache.New(time.Second*5, time.Millisecond*500) + defer func() { + emergencyOperationGracefulPeriodMap = oldMap + }() + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + prevVal := config.ERSEnabled() + config.SetERSEnabled(tt.ersEnabled) + defer config.SetERSEnabled(prevVal) + + gotFunc := getCheckAndRecoverFunctionCode(tt.analysisCode, tt.analyzedInstanceKey) + require.EqualValues(t, tt.wantRecoveryFunction, gotFunc) + }) + } +}