Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add flag to keep node cordoned for reap failures #60

Merged
merged 10 commits into from
Nov 15, 2021
1 change: 1 addition & 0 deletions cmd/governor/app/nodereaper.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,5 @@ func init() {
nodeReapCmd.Flags().StringArrayVar(&nodeReaperArgs.ReapTainted, "reap-tainted", []string{}, "marks nodes with a given taint reapable, must be in format of comma separated taints key=value:effect, key:effect or key")
nodeReapCmd.Flags().Float64Var(&nodeReaperArgs.ReconsiderUnreapableAfter, "reconsider-unreapable-after", 45, "Time (in minutes) after which reconsider unreapable nodes")
nodeReapCmd.Flags().Int64Var(&nodeReaperArgs.DrainTimeoutSeconds, "drain-timeout", 600, "Time (in seconds) to wait before drain command timeout")
nodeReapCmd.Flags().BoolVar(&nodeReaperArgs.ReapFlappy, "ignore-failure", false, "Keep the failed node as cordoned")
}
1 change: 1 addition & 0 deletions examples/helm-chart/governor/templates/node-reaper.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,5 @@ spec:
- --reap-old-throttle={{ .Values.reaper.nodereaper.reapoldthrottle }}
{{- end }}
- --max-kill-nodes={{ .Values.reaper.nodereaper.maxkillnodes }}
- --ignore-failure={{ .Values.reaper.nodereaper.ignoreFailure }}
{{- end }}
2 changes: 2 additions & 0 deletions examples/helm-chart/governor/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,5 @@ reaper:
reapoldthrottle: 3600
# Kill a maximum of 1 nodes per run considering throttle wait times
maxkillnodes: 1
# Ignore Failure and keep the node as cordones
ignoreFailure: false
2 changes: 2 additions & 0 deletions examples/node-reaper.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -119,3 +119,5 @@ spec:
- --reap-old-throttle=3600
# Kill a maximum of 1 nodes per run considering throttle wait times
- --max-kill-nodes=1
# Ignore Failure and keep the node as Cordoned, default False
- --ignore-failure=false
10 changes: 5 additions & 5 deletions pkg/reaper/nodereaper/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,11 +101,11 @@ func runCommandWithContext(call string, args []string, timeoutSeconds int64) (st
return string(out), nil
}

func (ctx *ReaperContext) uncordonNode(name string, dryRun bool) error {
func (ctx *ReaperContext) uncordonNode(name string, dryRun bool, ingoreDrainFailure bool) error {
uncordonArgs := []string{"uncordon", name}
uncordonCommand := ctx.KubectlLocalPath
if dryRun {
log.Warnf("dry run is on, instance not uncordoned")
if dryRun || ingoreDrainFailure {
log.Warnf("dry run / ignore drain failure is on, instance not uncordoned")
shailshah9 marked this conversation as resolved.
Show resolved Hide resolved
} else {
_, err := runCommand(uncordonCommand, uncordonArgs)
if err != nil {
Expand Down Expand Up @@ -154,11 +154,11 @@ func (ctx *ReaperContext) drainNode(name string, dryRun bool) error {
if err.Error() == "command execution timed out" {
log.Warnf("failed to drain node %v, drain command timed-out", name)
ctx.annotateNode(name, ageUnreapableAnnotationKey, getUTCNowStr())
ctx.uncordonNode(name, dryRun)
ctx.uncordonNode(name, dryRun, ctx.IgnoreFailure)
return err
}
log.Warnf("failed to drain node: %v", err)
ctx.uncordonNode(name, dryRun)
ctx.uncordonNode(name, dryRun, ctx.IgnoreFailure)
return err
}
ctx.DrainedInstances++
Expand Down
2 changes: 2 additions & 0 deletions pkg/reaper/nodereaper/nodereaper.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ func (ctx *ReaperContext) validateArguments(args *Args) error {
ctx.EC2Region = args.EC2Region
ctx.ReapOld = args.ReapOld
ctx.MaxKill = args.MaxKill
ctx.IgnoreFailure = args.IgnoreFailure

log.Infof("AWS Region = %v", ctx.EC2Region)
log.Infof("Dry Run = %t", ctx.DryRun)
Expand Down Expand Up @@ -175,6 +176,7 @@ func (ctx *ReaperContext) validateArguments(args *Args) error {
log.Infof("Reap Unjoined = %t, threshold = %v minutes by tag %v=%v", ctx.ReapUnjoined, ctx.ReapUnjoinedThresholdMinutes, ctx.ReapUnjoinedKey, ctx.ReapUnjoinedValue)
log.Infof("Reconsider Unreapable after = %v minutes", ctx.ReconsiderUnreapableAfter)
log.Infof("Drain Timeout = %d seconds", ctx.DrainTimeoutSeconds)
log.Infof("Ignore Failure = %t", ctx.IgnoreFailure)

if !ctx.SoftReap {
log.Warnf("--soft-reap is off !! will not consider pods when reaping")
Expand Down
28 changes: 28 additions & 0 deletions pkg/reaper/nodereaper/nodereaper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ func newFakeReaperContext() *ReaperContext {
ctx.ReapOldThresholdMinutes = 36000
ctx.MaxKill = 3
ctx.DrainTimeoutSeconds = 600
ctx.IgnoreFailure = false
loadFakeAPI(&ctx)
return &ctx
}
Expand Down Expand Up @@ -838,6 +839,33 @@ func TestReapOldDisabled(t *testing.T) {
testCase.Run(t, false)
}

func TestIgnoreReapFailure(t *testing.T) {
reaper := newFakeReaperContext()
reaper.IgnoreFailure = true

testCase := ReaperUnitTest{
TestDescription: "Ignore failure - old nodes should be cordoned",
InstanceGroup: FakeASG{
Name: "my-ig.cluster.k8s.local",
Healthy: 0,
Unhealthy: 1,
Desired: 1,
},
Nodes: []FakeNode{
{
state: "Unknown",
ageMinutes: 43200,
},
},
FakeReaper: reaper,
ExpectedUnready: 1,
ExpectedOldReapable: 1,
ExpectedTerminated: 0,
ExpectedDrained: 0,
}
testCase.Run(t, false)
}

func TestReapOldSelfEviction(t *testing.T) {
reaper := newFakeReaperContext()

Expand Down
2 changes: 2 additions & 0 deletions pkg/reaper/nodereaper/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ type Args struct {
ReapTainted []string
ReconsiderUnreapableAfter float64
DrainTimeoutSeconds int64
IgnoreFailure bool
}

// ReaperContext holds the context of the node-reaper and target cluster
Expand Down Expand Up @@ -92,6 +93,7 @@ type ReaperContext struct {
ReapTainted []v1.Taint
ReconsiderUnreapableAfter float64
DrainTimeoutSeconds int64
IgnoreFailure bool
// runtime
UnreadyNodes []v1.Node
AllNodes []v1.Node
Expand Down