diff --git a/go/flags/endtoend/vtorc.txt b/go/flags/endtoend/vtorc.txt index 18b5e236f74..7c7e4502587 100644 --- a/go/flags/endtoend/vtorc.txt +++ b/go/flags/endtoend/vtorc.txt @@ -17,6 +17,7 @@ vtorc \ Flags: --allow-emergency-reparent Whether VTOrc should be allowed to run emergency reparent operation when it detects a dead primary (default true) + --allow-recovery Whether VTOrc should be allowed to run recovery actions (default true) --alsologtostderr log to standard error as well as files --audit-file-location string File location where the audit logs are to be stored --audit-purge-duration duration Duration for which audit logs are held before being purged. Should be in multiples of days (default 168h0m0s) diff --git a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go index e3b55d64c6b..f79caeae08a 100644 --- a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go +++ b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go @@ -57,6 +57,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) { "--topo_global_root", clusterInfo.ClusterInstance.VtctlProcess.TopoGlobalRoot, } servenv.ParseFlags("vtorc") + config.Config.AllowRecovery = true config.Config.RecoveryPeriodBlockSeconds = 1 config.Config.InstancePollSeconds = 1 config.MarkConfigurationLoaded() diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index 5c020631748..8175bcb0d8b 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -62,6 +62,7 @@ var ( tolerableReplicationLag = 0 * time.Second topoInformationRefreshDuration = 15 * time.Second recoveryPollDuration = 1 * time.Second + allowRecovery = true ersEnabled = true convertTabletsWithErrantGTIDs = false ) @@ -83,6 +84,7 @@ func RegisterFlags(fs *pflag.FlagSet) { fs.DurationVar(&tolerableReplicationLag, "tolerable-replication-lag", tolerableReplicationLag, "Amount of replication lag that is considered acceptable for a tablet to be 
eligible for promotion when Vitess makes the choice of a new primary in PRS") fs.DurationVar(&topoInformationRefreshDuration, "topo-information-refresh-duration", topoInformationRefreshDuration, "Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topology server") fs.DurationVar(&recoveryPollDuration, "recovery-poll-duration", recoveryPollDuration, "Timer duration on which VTOrc polls its database to run a recovery") + fs.BoolVar(&allowRecovery, "allow-recovery", allowRecovery, "Whether VTOrc should be allowed to run recovery actions") fs.BoolVar(&ersEnabled, "allow-emergency-reparent", ersEnabled, "Whether VTOrc should be allowed to run emergency reparent operation when it detects a dead primary") fs.BoolVar(&convertTabletsWithErrantGTIDs, "change-tablets-with-errant-gtid-to-drained", convertTabletsWithErrantGTIDs, "Whether VTOrc should be changing the type of tablets with errant GTIDs to DRAINED") } @@ -106,6 +108,7 @@ type Configuration struct { WaitReplicasTimeoutSeconds int // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockTimeout since that is the total time we use for an ERS. TolerableReplicationLagSeconds int // Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS. TopoInformationRefreshSeconds int // Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topo-server. + AllowRecovery bool // Whether VTOrc should be allowed to run recovery actions. 
RecoveryPollSeconds int // Timer duration on which VTOrc recovery analysis runs } @@ -137,6 +140,7 @@ func UpdateConfigValuesFromFlags() { Config.WaitReplicasTimeoutSeconds = int(waitReplicasTimeout / time.Second) Config.TolerableReplicationLagSeconds = int(tolerableReplicationLag / time.Second) Config.TopoInformationRefreshSeconds = int(topoInformationRefreshDuration / time.Second) + Config.AllowRecovery = allowRecovery Config.RecoveryPollSeconds = int(recoveryPollDuration / time.Second) } @@ -150,6 +154,11 @@ func SetERSEnabled(val bool) { ersEnabled = val } +// GetAllowRecovery reports whether VTOrc is allowed to run recovery actions, as set by the --allow-recovery flag. +func GetAllowRecovery() bool { + return allowRecovery +} + // ConvertTabletWithErrantGTIDs reports whether VTOrc is allowed to change the tablet type of tablets with errant GTIDs to DRAINED. func ConvertTabletWithErrantGTIDs() bool { return convertTabletsWithErrantGTIDs @@ -181,6 +190,7 @@ func newConfiguration() *Configuration { PreventCrossDataCenterPrimaryFailover: false, WaitReplicasTimeoutSeconds: 30, TopoInformationRefreshSeconds: 15, + AllowRecovery: true, RecoveryPollSeconds: 1, } } diff --git a/go/vt/vtorc/logic/vtorc.go b/go/vt/vtorc/logic/vtorc.go index 6fe06c70e6a..f1f8839c978 100644 --- a/go/vt/vtorc/logic/vtorc.go +++ b/go/vt/vtorc/logic/vtorc.go @@ -355,6 +355,14 @@ func ContinuousDiscovery() { checkAndRecoverWaitPeriod := 3 * instancePollSecondsDuration() recentDiscoveryOperationKeys = cache.New(instancePollSecondsDuration(), time.Second) + if !config.GetAllowRecovery() { + log.Info("--allow-recovery is set to 'false', disabling recovery actions") + if err := DisableRecovery(); err != nil { + log.Errorf("failed to disable recoveries: %+v", err) + return + } + } + go handleDiscoveryRequests() healthTick := time.Tick(config.HealthPollSeconds * time.Second)