Skip to content

Commit

Permalink
fix to execute arbitrary command when retirement
Browse files Browse the repository at this point in the history
  • Loading branch information
YZ775 committed Oct 23, 2024
1 parent 31e4508 commit 421ea18
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 79 deletions.
9 changes: 3 additions & 6 deletions cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -321,14 +321,11 @@ const DefaultRepairHealthCheckCommandTimeoutSeconds = 30
const DefaultRepairCommandTimeoutSeconds = 30

type Retire struct {
ShutdownCommand []string `json:"shutdown_command"`
CheckCommand []string `json:"check_command"`
CommandTimeoutSeconds *int `json:"command_timeout_seconds,omitempty"`
CheckTimeoutSeconds *int `json:"check_timeout_seconds,omitempty"`
OptionalCommand []string `json:"optional_command,omitempty"`
OptionalCommandTimeoutSeconds *int `json:"optional_command_timeout_seconds,omitempty"`
}

const DefaultRetireCommandTimeoutSeconds = 30
const DefaultRetireCheckTimeoutSeconds = 300
const DefaultRetireOptionalCommandTimeoutSeconds = 30

// Options is a set of optional parameters for k8s components.
type Options struct {
Expand Down
6 changes: 2 additions & 4 deletions mtest/cke-cluster.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,8 @@ repair:
watch_seconds: 30
health_check_command: ["sh", "-c", "test -f /tmp/mtest-repair-$1 && echo true", "health_check"]
retire:
shutdown_command: ["true"]
check_command: ["bash", "-c", "echo 'Off'"]
command_timeout_seconds: 30
check_timeout_seconds: 300
optional_command: ["true"]
optional_command_timeout_seconds: 30
options:
kube-api:
extra_binds:
Expand Down
92 changes: 23 additions & 69 deletions op/kube_node_remove.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (
"time"

"github.com/cybozu-go/cke"
"github.com/cybozu-go/log"
"github.com/cybozu-go/well"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -41,10 +40,8 @@ func (o *kubeNodeRemove) NextCommand() cke.Commander {
return nodeRemoveCommand{
o.apiserver,
o.nodes,
o.config.ShutdownCommand,
o.config.CheckCommand,
o.config.CommandTimeoutSeconds,
o.config.CheckTimeoutSeconds,
o.config.OptionalCommand,
o.config.OptionalCommandTimeoutSeconds,
}
}

Expand All @@ -55,12 +52,10 @@ func (o *kubeNodeRemove) Targets() []string {
}

type nodeRemoveCommand struct {
apiserver *cke.Node
nodes []*corev1.Node
shutdownCommand []string
checkCommand []string
timeoutSeconds *int
checkTimeoutSeconds *int
apiserver *cke.Node
nodes []*corev1.Node
optionalCommand []string
optionalCommandTimeoutSeconds *int
}

func (c nodeRemoveCommand) Run(ctx context.Context, inf cke.Infrastructure, _ string) error {
Expand Down Expand Up @@ -92,66 +87,25 @@ func (c nodeRemoveCommand) Run(ctx context.Context, inf cke.Infrastructure, _ st
return fmt.Errorf("failed to patch node %s: %v", n.Name, err)
}
}
err := func() error {
ctx := ctx
timeout := cke.DefaultRetireCommandTimeoutSeconds
if c.timeoutSeconds != nil {
timeout = *c.timeoutSeconds
}
if timeout != 0 {
var cancel context.CancelFunc
ctx, cancel = context.WithTimeout(ctx, time.Second*time.Duration(timeout))
defer cancel()
}
args := append(c.shutdownCommand[1:], n.Name)
command := well.CommandContext(ctx, c.shutdownCommand[0], args...)
return command.Run()
}()
if err != nil {
return fmt.Errorf("failed to shutdown node %s: %v", n.Name, err)
}

err = func() error {
ctx := ctx
checkTimeout := cke.DefaultRetireCheckTimeoutSeconds
if c.checkTimeoutSeconds != nil {
checkTimeout = *c.checkTimeoutSeconds
}
timeout := time.After(time.Duration(checkTimeout) * time.Second)
ticker := time.NewTicker(10 * time.Second)
for {
select {
case <-timeout:
return fmt.Errorf("timeout")
case <-ticker.C:
args := append(c.checkCommand[1:], n.Name)
command := well.CommandContext(ctx, c.checkCommand[0], args...)
stdout, err := command.Output()
if err != nil {
log.Warn("failed to check shutdown status of node", map[string]interface{}{
log.FnError: err,
"node": n.Name,
})
continue
}
if strings.TrimSuffix(string(stdout), "\n") == "Off" {
return nil
}
if len(c.optionalCommand) != 0 {
err := func() error {
ctx := ctx
timeout := cke.DefaultRetireOptionalCommandTimeoutSeconds
if c.optionalCommandTimeoutSeconds != nil {
timeout = *c.optionalCommandTimeoutSeconds
}
if timeout != 0 {
var cancel context.CancelFunc
ctx, cancel = context.WithTimeout(ctx, time.Second*time.Duration(timeout))
defer cancel()
}
args := append(c.optionalCommand[1:], n.Name)
command := well.CommandContext(ctx, c.optionalCommand[0], args...)
return command.Run()
}()
if err != nil {
return fmt.Errorf("failed to execute optional command in retirement %s: %v", n.Name, err)
}
}()
if err != nil {
return fmt.Errorf("failed to check shutdown status of node %s: %v", n.Name, err)
}
shutdownTaint := corev1.Taint{
Key: "node.kubernetes.io/out-of-service",
Value: "nodeshutdown",
Effect: corev1.TaintEffectNoExecute,
}
n.Spec.Taints = append(n.Spec.Taints, shutdownTaint)
_, err = nodesAPI.Update(ctx, n, metav1.UpdateOptions{})
if err != nil {
return fmt.Errorf("failed to update node %s: %v", n.Name, err)
}

err = nodesAPI.Delete(ctx, n.Name, metav1.DeleteOptions{})
Expand Down

0 comments on commit 421ea18

Please sign in to comment.