Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add flag to keep node cordoned for reap failures #60

Merged
merged 10 commits into from
Nov 15, 2021
1 change: 1 addition & 0 deletions cmd/governor/app/nodereaper.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,5 @@ func init() {
nodeReapCmd.Flags().StringArrayVar(&nodeReaperArgs.ReapTainted, "reap-tainted", []string{}, "marks nodes with a given taint reapable, must be in format of comma separated taints key=value:effect, key:effect or key")
nodeReapCmd.Flags().Float64Var(&nodeReaperArgs.ReconsiderUnreapableAfter, "reconsider-unreapable-after", 45, "Time (in minutes) after which reconsider unreapable nodes")
nodeReapCmd.Flags().Int64Var(&nodeReaperArgs.DrainTimeoutSeconds, "drain-timeout", 600, "Time (in seconds) to wait before drain command timeout")
nodeReapCmd.Flags().BoolVar(&nodeReaperArgs.IgnoreFailure, "ignore-failure", false, "Keep the failed node as cordoned")
}
1 change: 1 addition & 0 deletions examples/helm-chart/governor/templates/node-reaper.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,5 @@ spec:
- --reap-old-throttle={{ .Values.reaper.nodereaper.reapoldthrottle }}
{{- end }}
- --max-kill-nodes={{ .Values.reaper.nodereaper.maxkillnodes }}
- --ignore-failure={{ .Values.reaper.nodereaper.ignoreFailure }}
{{- end }}
2 changes: 2 additions & 0 deletions examples/helm-chart/governor/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,5 @@ reaper:
reapoldthrottle: 3600
# Kill a maximum of 1 nodes per run considering throttle wait times
maxkillnodes: 1
# Ignore Failure and keep the node as cordones
ignoreFailure: false
2 changes: 2 additions & 0 deletions examples/node-reaper.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -119,3 +119,5 @@ spec:
- --reap-old-throttle=3600
# Kill a maximum of 1 nodes per run considering throttle wait times
- --max-kill-nodes=1
# Ignore Failure and keep the node as Cordoned, default False
- --ignore-failure=false
12 changes: 6 additions & 6 deletions pkg/reaper/nodereaper/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,15 +101,15 @@ func runCommandWithContext(call string, args []string, timeoutSeconds int64) (st
return string(out), nil
}

func (ctx *ReaperContext) uncordonNode(name string, dryRun bool) error {
func (ctx *ReaperContext) uncordonNode(name string, dryRun bool, ignoreDrainFailure bool) error {
uncordonArgs := []string{"uncordon", name}
uncordonCommand := ctx.KubectlLocalPath
if dryRun {
log.Warnf("dry run is on, instance not uncordoned")
if dryRun || ignoreDrainFailure {
log.Warnf("dry run / ignore drain failure is on, instance %v remains cordoned", name)
} else {
_, err := runCommand(uncordonCommand, uncordonArgs)
if err != nil {
log.Errorf("failed to uncrodon node %v", name)
log.Errorf("failed to uncordon node %v", name)
return err
}
}
Expand Down Expand Up @@ -154,11 +154,11 @@ func (ctx *ReaperContext) drainNode(name string, dryRun bool) error {
if err.Error() == "command execution timed out" {
log.Warnf("failed to drain node %v, drain command timed-out", name)
ctx.annotateNode(name, ageUnreapableAnnotationKey, getUTCNowStr())
ctx.uncordonNode(name, dryRun)
ctx.uncordonNode(name, dryRun, ctx.IgnoreFailure)
return err
}
log.Warnf("failed to drain node: %v", err)
ctx.uncordonNode(name, dryRun)
ctx.uncordonNode(name, dryRun, ctx.IgnoreFailure)
return err
}
ctx.DrainedInstances++
Expand Down
81 changes: 81 additions & 0 deletions pkg/reaper/nodereaper/nodereaper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ func newFakeReaperContext() *ReaperContext {
ctx.ReapOldThresholdMinutes = 36000
ctx.MaxKill = 3
ctx.DrainTimeoutSeconds = 600
ctx.IgnoreFailure = false
loadFakeAPI(&ctx)
return &ctx
}
Expand Down Expand Up @@ -838,6 +839,32 @@ func TestReapOldDisabled(t *testing.T) {
testCase.Run(t, false)
}

func TestIgnoreReapFailure(t *testing.T) {
reaper := newFakeReaperContext()
reaper.IgnoreFailure = true
testCase := ReaperUnitTest{
TestDescription: "Ignore failure - old nodes should be cordoned",
InstanceGroup: FakeASG{
Name: "my-ig.cluster.k8s.local",
Healthy: 0,
Unhealthy: 1,
Desired: 1,
},
Nodes: []FakeNode{
{
state: "Unknown",
ageMinutes: 43200,
},
},
FakeReaper: reaper,
ExpectedUnready: 1,
ExpectedOldReapable: 1,
ExpectedTerminated: 0,
ExpectedDrained: 0,
}
testCase.Run(t, false)
}

func TestReapOldSelfEviction(t *testing.T) {
reaper := newFakeReaperContext()

Expand Down Expand Up @@ -1291,6 +1318,60 @@ func TestDryRun(t *testing.T) {
testCase.Run(t, false)
}

func TestIgnoreFailure(t *testing.T) {
reaper := newFakeReaperContext()
reaper.AsgValidation = false
reaper.DryRun = false
reaper.IgnoreFailure = true
reaper.DrainTimeoutSeconds = 0

testCase := ReaperUnitTest{
TestDescription: "Ignore Failure - failed nodes should not be uncordoned",
InstanceGroup: FakeASG{
Name: "my-ig.cluster.k8s.local",
Healthy: 0,
Unhealthy: 2,
Desired: 3,
},
Nodes: []FakeNode{
{
nodeName: "ip-10-10-10-10.us-west-2.compute.local",
state: "NotReady",
ageMinutes: 43200,
},
{
nodeName: "ip-10-10-10-11.us-west-2.compute.local",
state: "Unknown",
ageMinutes: 43200,
},
{
state: "Unknown",
ageMinutes: 43200,
},
},
Events: []FakeEvent{
{
node: "ip-10-10-10-10.us-west-2.compute.local",
count: 300,
reason: "NodeDrainFailed",
kind: "Node",
},
{
node: "ip-10-10-10-11.us-west-2.compute.local",
count: 20,
reason: "NodeDrainFailed",
kind: "Node",
},
},
FakeReaper: reaper,
ExpectedUnready: 3,
ExpectedOldReapable: 3,
ExpectedDrainable: 0,
ExpectedDrained: 0,
}
testCase.Run(t, false)
}

func TestKillOldMasterMinMasters(t *testing.T) {
reaper := newFakeReaperContext()
reaper.MaxKill = 1
Expand Down
2 changes: 2 additions & 0 deletions pkg/reaper/nodereaper/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ type Args struct {
ReapTainted []string
ReconsiderUnreapableAfter float64
DrainTimeoutSeconds int64
IgnoreFailure bool
}

// ReaperContext holds the context of the node-reaper and target cluster
Expand Down Expand Up @@ -92,6 +93,7 @@ type ReaperContext struct {
ReapTainted []v1.Taint
ReconsiderUnreapableAfter float64
DrainTimeoutSeconds int64
IgnoreFailure bool
// runtime
UnreadyNodes []v1.Node
AllNodes []v1.Node
Expand Down