diff --git a/changelog/18.0/18.0.0/summary.md b/changelog/18.0/18.0.0/summary.md
new file mode 100644
index 00000000000..3a8c3a17efd
--- /dev/null
+++ b/changelog/18.0/18.0.0/summary.md
@@ -0,0 +1,24 @@
+## Summary
+
+### Table of Contents
+
+- **[Major Changes](#major-changes)**
+ - **[Breaking Changes](#breaking-changes)**
+ - **[New command line flags and behavior](#new-flag)**
+ - [VTOrc flag `--allow-emergency-reparent`](#new-flag-toggle-ers)
+ - **[Deprecations and Deletions](#deprecations-and-deletions)**
+
+
+## Major Changes
+
+### Breaking Changes
+
+### <a id="new-flag"/> New command line flags and behavior
+
+#### <a id="new-flag-toggle-ers"/> VTOrc flag `--allow-emergency-reparent`
+
+VTOrc has a new flag `--allow-emergency-reparent` that lets users toggle VTOrc's ability to run emergency reparent operations.
+Users who want VTOrc to fix replication issues, but don't want it to run any reparents, should start using this flag.
+By default, VTOrc will be able to run `EmergencyReparentShard`. Users must set the flag to `false` to change this behaviour.
+
+### Deprecations and Deletions
diff --git a/changelog/18.0/README.md b/changelog/18.0/README.md
new file mode 100644
index 00000000000..c12db42b0b6
--- /dev/null
+++ b/changelog/18.0/README.md
@@ -0,0 +1 @@
+## v18.0
diff --git a/go/flags/endtoend/vtorc.txt b/go/flags/endtoend/vtorc.txt
index 3f184b7e8d5..bcf0f37bf66 100644
--- a/go/flags/endtoend/vtorc.txt
+++ b/go/flags/endtoend/vtorc.txt
@@ -1,4 +1,5 @@
Usage of vtorc:
+ --allow-emergency-reparent Whether VTOrc should be allowed to run emergency reparent operation when it detects a dead primary (default true)
--alsologtostderr log to standard error as well as files
--audit-file-location string File location where the audit logs are to be stored
--audit-purge-duration duration Duration for which audit logs are held before being purged. Should be in multiples of days (default 168h0m0s)
diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go
index 3d3dde96034..73e55378bec 100644
--- a/go/vt/vtorc/config/config.go
+++ b/go/vt/vtorc/config/config.go
@@ -67,6 +67,7 @@ var (
waitReplicasTimeout = 30 * time.Second
topoInformationRefreshDuration = 15 * time.Second
recoveryPollDuration = 1 * time.Second
+ ersEnabled = true
)
// RegisterFlags registers the flags required by VTOrc
@@ -86,6 +87,7 @@ func RegisterFlags(fs *pflag.FlagSet) {
fs.DurationVar(&waitReplicasTimeout, "wait-replicas-timeout", waitReplicasTimeout, "Duration for which to wait for replica's to respond when issuing RPCs")
fs.DurationVar(&topoInformationRefreshDuration, "topo-information-refresh-duration", topoInformationRefreshDuration, "Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topology server")
fs.DurationVar(&recoveryPollDuration, "recovery-poll-duration", recoveryPollDuration, "Timer duration on which VTOrc polls its database to run a recovery")
+ fs.BoolVar(&ersEnabled, "allow-emergency-reparent", ersEnabled, "Whether VTOrc should be allowed to run emergency reparent operation when it detects a dead primary")
}
// Configuration makes for vtorc configuration input, which can be provided by user via JSON formatted file.
@@ -137,6 +139,16 @@ func UpdateConfigValuesFromFlags() {
Config.RecoveryPollSeconds = int(recoveryPollDuration / time.Second)
}
+// ERSEnabled reports whether VTOrc is allowed to run emergency reparent operations (ERS); set via --allow-emergency-reparent, default true.
+func ERSEnabled() bool {
+ return ersEnabled
+}
+
+// SetERSEnabled overwrites ersEnabled, the variable backing the --allow-emergency-reparent flag. This should only be used from tests.
+func SetERSEnabled(val bool) {
+ ersEnabled = val
+}
+
// LogConfigValues is used to log the config values.
func LogConfigValues() {
b, _ := json.MarshalIndent(Config, "", "\t")
diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go
index 3b2734bcda0..0d1e07f53cb 100644
--- a/go/vt/vtorc/logic/topology_recovery.go
+++ b/go/vt/vtorc/logic/topology_recovery.go
@@ -433,6 +433,10 @@ func getCheckAndRecoverFunctionCode(analysisCode inst.AnalysisCode, analyzedInst
switch analysisCode {
// primary
case inst.DeadPrimary, inst.DeadPrimaryAndSomeReplicas:
+ // If ERS is disabled, we have no way of repairing the cluster.
+ if !config.ERSEnabled() {
+ return noRecoveryFunc
+ }
if isInEmergencyOperationGracefulPeriod(analyzedInstanceKey) {
return recoverGenericProblemFunc
}
diff --git a/go/vt/vtorc/logic/topology_recovery_test.go b/go/vt/vtorc/logic/topology_recovery_test.go
index 211d8ff3467..2945a796fcc 100644
--- a/go/vt/vtorc/logic/topology_recovery_test.go
+++ b/go/vt/vtorc/logic/topology_recovery_test.go
@@ -26,10 +26,9 @@ import (
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
"vitess.io/vitess/go/vt/topo/memorytopo"
+ "vitess.io/vitess/go/vt/vtorc/config"
"vitess.io/vitess/go/vt/vtorc/db"
"vitess.io/vitess/go/vt/vtorc/inst"
-
- // import the gRPC client implementation for tablet manager
_ "vitess.io/vitess/go/vt/vttablet/grpctmclient"
)
@@ -190,3 +189,70 @@ func TestDifferentAnalysescHaveDifferentCooldowns(t *testing.T) {
_, err = AttemptRecoveryRegistration(&primaryAnalysisEntry, true, true)
require.Nil(t, err)
}
+
+func TestGetCheckAndRecoverFunctionCode(t *testing.T) {
+ tests := []struct {
+ name string
+ ersEnabled bool
+ analysisCode inst.AnalysisCode
+ analyzedInstanceKey *inst.InstanceKey
+ wantRecoveryFunction recoveryFunction
+ }{
+ {
+ name: "DeadPrimary with ERS enabled",
+ ersEnabled: true,
+ analysisCode: inst.DeadPrimary,
+ analyzedInstanceKey: &inst.InstanceKey{
+ Hostname: hostname,
+ Port: 1,
+ },
+ wantRecoveryFunction: recoverDeadPrimaryFunc,
+ }, {
+ name: "DeadPrimary with ERS disabled",
+ ersEnabled: false,
+ analysisCode: inst.DeadPrimary,
+ analyzedInstanceKey: &inst.InstanceKey{
+ Hostname: hostname,
+ Port: 1,
+ },
+ wantRecoveryFunction: noRecoveryFunc,
+ }, {
+ name: "PrimaryHasPrimary",
+ ersEnabled: false,
+ analysisCode: inst.PrimaryHasPrimary,
+ wantRecoveryFunction: recoverPrimaryHasPrimaryFunc,
+ }, {
+ name: "ClusterHasNoPrimary",
+ ersEnabled: false,
+ analysisCode: inst.ClusterHasNoPrimary,
+ wantRecoveryFunction: electNewPrimaryFunc,
+ }, {
+ name: "ReplicationStopped",
+ ersEnabled: false,
+ analysisCode: inst.ReplicationStopped,
+ wantRecoveryFunction: fixReplicaFunc,
+ }, {
+ name: "PrimarySemiSyncMustBeSet",
+ ersEnabled: false,
+ analysisCode: inst.PrimarySemiSyncMustBeSet,
+ wantRecoveryFunction: fixPrimaryFunc,
+ },
+ }
+
+ // Swap in a fresh graceful-period cache, restored on exit; presumably isInEmergencyOperationGracefulPeriod reads it for the DeadPrimary cases (TODO confirm).
+ oldMap := emergencyOperationGracefulPeriodMap
+ emergencyOperationGracefulPeriodMap = cache.New(time.Second*5, time.Millisecond*500)
+ defer func() {
+ emergencyOperationGracefulPeriodMap = oldMap
+ }()
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ prevVal := config.ERSEnabled()
+ config.SetERSEnabled(tt.ersEnabled)
+ defer config.SetERSEnabled(prevVal)
+
+ gotFunc := getCheckAndRecoverFunctionCode(tt.analysisCode, tt.analyzedInstanceKey)
+ require.EqualValues(t, tt.wantRecoveryFunction, gotFunc)
+ })
+ }
+}