Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 88 additions & 2 deletions .github/workflows/on-pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,9 @@ jobs:
# - Ensure manifests work with the latest versions even with no manifest change
# (compared to helm charts, manifests cannot easily template changes based on versions)
# Helm charts are _trailing_ releases, while manifests are done during development.
e2e-manifests:
name: End-to-End test with kured with code and manifests from HEAD
# This test uses the "command" reboot-method.
e2e-manifests-command:
name: End-to-End test with kured with code and manifests from HEAD (command)
runs-on: ubuntu-latest
strategy:
fail-fast: false
Expand Down Expand Up @@ -181,6 +182,91 @@ jobs:
./tests/kind/follow-coordinated-reboot.sh


# This ensures the latest code works with the manifests built from tree.
# It is useful for two things:
# - Test manifests changes (obviously), ensuring they don't break existing clusters
# - Ensure manifests work with the latest versions even with no manifest change
# (compared to helm charts, manifests cannot easily template changes based on versions)
# Helm charts are _trailing_ releases, while manifests are done during development.
# This test uses the "signal" reboot-method.
e2e-manifests-signal:
name: End-to-End test with kured with code and manifests from HEAD (signal)
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
kubernetes:
- "1.26"
- "1.27"
- "1.28"
steps:
- uses: actions/checkout@v3
- name: Ensure go version
uses: actions/setup-go@v4
with:
go-version-file: 'go.mod'
check-latest: true
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
- name: Setup GoReleaser
run: make bootstrap-tools
- name: Find current tag version
run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
id: tags
- name: Build artifacts
run: |
VERSION="${{ steps.tags.outputs.sha_short }}" make image
VERSION="${{ steps.tags.outputs.sha_short }}" make manifest

- name: Workaround "Failed to attach 1 to compat systemd cgroup /actions_job/..." on gh actions
run: |
sudo bash << EOF
cp /etc/docker/daemon.json /etc/docker/daemon.json.old
echo '{}' > /etc/docker/daemon.json
systemctl restart docker || journalctl --no-pager -n 500
systemctl status docker
EOF

# Default name for helm/kind-action kind clusters is "chart-testing"
- name: Create kind cluster with 5 nodes
uses: helm/kind-action@v1.8.0
with:
config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml
version: v0.14.0

- name: Preload previously built images onto kind cluster
run: kind load docker-image ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }} --name chart-testing

- name: Do not wait for an hour before detecting the rebootSentinel
run: |
sed -i 's/#\(.*\)--period=1h/\1--period=30s/g' kured-ds-signal.yaml

- name: Install kured with kubectl
run: |
kubectl apply -f kured-rbac.yaml && kubectl apply -f kured-ds-signal.yaml

- name: Ensure kured is ready
uses: nick-invision/retry@v2.8.3
with:
timeout_minutes: 10
max_attempts: 10
retry_wait_seconds: 60
# DESIRED CURRENT READY UP-TO-DATE AVAILABLE should all be = to cluster_size
command: "kubectl get ds -n kube-system kured | grep -E 'kured.*5.*5.*5.*5.*5'"

- name: Create reboot sentinel files
run: |
./tests/kind/create-reboot-sentinels.sh

- name: Follow reboot until success
env:
DEBUG: true
run: |
./tests/kind/follow-coordinated-reboot.sh



# This ensures the latest code works with the manifests built from tree.
# It is useful for two things:
Expand Down
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ minikube-publish: image

manifest:
sed -i "s#image: ghcr.io/.*kured.*#image: ghcr.io/$(DH_ORG)/kured:$(VERSION)#g" kured-ds.yaml
sed -i "s#image: ghcr.io/.*kured.*#image: ghcr.io/$(DH_ORG)/kured:$(VERSION)#g" kured-ds-signal.yaml
echo "Please generate combined manifest if necessary"

test:
Expand Down
83 changes: 47 additions & 36 deletions cmd/kured/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,10 @@ import (
"github.com/kubereboot/kured/pkg/alerts"
"github.com/kubereboot/kured/pkg/daemonsetlock"
"github.com/kubereboot/kured/pkg/delaytick"
"github.com/kubereboot/kured/pkg/reboot"
"github.com/kubereboot/kured/pkg/taints"
"github.com/kubereboot/kured/pkg/timewindow"
"github.com/kubereboot/kured/pkg/util"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
Expand All @@ -47,6 +49,7 @@ var (
drainDelay time.Duration
drainTimeout time.Duration
rebootDelay time.Duration
rebootMethod string
period time.Duration
metricsHost string
metricsPort int
Expand Down Expand Up @@ -74,6 +77,7 @@ var (
messageTemplateUncordon string
podSelectors []string
rebootCommand string
rebootSignal int
logFormat string
preRebootNodeLabels []string
postRebootNodeLabels []string
Expand Down Expand Up @@ -103,6 +107,13 @@ const (
KuredMostRecentRebootNeededAnnotation string = "weave.works/kured-most-recent-reboot-needed"
// EnvPrefix The environment variable prefix of all environment variables bound to our command line flags.
EnvPrefix = "KURED"

// MethodCommand is used as "--reboot-method" value when rebooting with the configured "--reboot-command"
MethodCommand = "command"
// MethodSignal is used as "--reboot-method" value when rebooting with a SIGRTMIN+5 signal.
MethodSignal = "signal"

sigTrminPlus5 = 34 + 5

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this meant to be sigRtminPlus5 ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I fix the typo.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like the typo is still there? But otherwise this should be finally good to go!

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I didn't catch that. Let's fix the typo. If we're going to use camel case I'd vote for:

  • sigRTMin

Maybe:

const (
...
    sigRTMin = 34
    defaultSignal = sigRTMin + 5
...
)

And we should probably change the MethodSignal comment so that it doesn't imply that SIGRTMIN+5 is static (it's just a default, but the user can use whatever number within the min-max range that their OS allows). Maybe:

// MethodSignal is used as "--reboot-method" value when rebooting according to a signal in the allowed range between SIGRTMIN and SIGRTMAX as defined on the OS platform running kured

)

func init() {
Expand Down Expand Up @@ -146,6 +157,8 @@ func NewRootCommand() *cobra.Command {
"timeout after which the drain is aborted (default: 0, infinite time)")
rootCmd.PersistentFlags().DurationVar(&rebootDelay, "reboot-delay", 0,
"delay reboot for this duration (default: 0, disabled)")
rootCmd.PersistentFlags().StringVar(&rebootMethod, "reboot-method", "command",
"method to use for reboots. Available: command")
rootCmd.PersistentFlags().DurationVar(&period, "period", time.Minute*60,
"sentinel check period")
rootCmd.PersistentFlags().StringVar(&dsNamespace, "ds-namespace", "kube-system",
Expand Down Expand Up @@ -176,6 +189,8 @@ func NewRootCommand() *cobra.Command {
"command to run when a reboot is required")
rootCmd.PersistentFlags().IntVar(&concurrency, "concurrency", 1,
"amount of nodes to concurrently reboot. Defaults to 1")
rootCmd.PersistentFlags().IntVar(&rebootSignal, "reboot-signal", sigTrminPlus5,
"signal to use for reboot, SIGRTMIN+5 by default.")

rootCmd.PersistentFlags().StringVar(&slackHookURL, "slack-hook-url", "",
"slack hook URL for reboot notifications [deprecated in favor of --notify-url]")
Expand Down Expand Up @@ -299,22 +314,6 @@ func flagToEnvVar(flag string) string {
return fmt.Sprintf("%s_%s", EnvPrefix, envVarSuffix)
}

// newCommand creates a new Command with stdout/stderr wired to our standard logger
func newCommand(name string, arg ...string) *exec.Cmd {
cmd := exec.Command(name, arg...)
cmd.Stdout = log.NewEntry(log.StandardLogger()).
WithField("cmd", cmd.Args[0]).
WithField("std", "out").
WriterLevel(log.InfoLevel)

cmd.Stderr = log.NewEntry(log.StandardLogger()).
WithField("cmd", cmd.Args[0]).
WithField("std", "err").
WriterLevel(log.WarnLevel)

return cmd
}

// buildHostCommand writes a new command to run in the host namespace
// Rancher based need different pid
func buildHostCommand(pid int, command []string) []string {
Expand All @@ -327,7 +326,7 @@ func buildHostCommand(pid int, command []string) []string {
}

func rebootRequired(sentinelCommand []string) bool {
cmd := newCommand(sentinelCommand[0], sentinelCommand[1:]...)
cmd := util.NewCommand(sentinelCommand[0], sentinelCommand[1:]...)
if err := cmd.Run(); err != nil {
switch err := err.(type) {
case *exec.ExitError:
Expand Down Expand Up @@ -557,20 +556,6 @@ func uncordon(client *kubernetes.Clientset, node *v1.Node) error {
return nil
}

func invokeReboot(nodeID string, rebootCommand []string) {
log.Infof("Running command: %s for node: %s", rebootCommand, nodeID)

if notifyURL != "" {
if err := shoutrrr.Send(notifyURL, fmt.Sprintf(messageTemplateReboot, nodeID)); err != nil {
log.Warnf("Error notifying: %v", err)
}
}

if err := newCommand(rebootCommand[0], rebootCommand[1:]...).Run(); err != nil {
log.Fatalf("Error invoking reboot command: %v", err)
}
}

func maintainRebootRequiredMetric(nodeID string, sentinelCommand []string) {
for {
if rebootRequired(sentinelCommand) {
Expand Down Expand Up @@ -661,7 +646,7 @@ func updateNodeLabels(client *kubernetes.Clientset, node *v1.Node, labels []stri
}
}

func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []string, window *timewindow.TimeWindow, TTL time.Duration, releaseDelay time.Duration) {
func rebootAsRequired(nodeID string, booter reboot.Reboot, sentinelCommand []string, window *timewindow.TimeWindow, TTL time.Duration, releaseDelay time.Duration) {
config, err := rest.InClusterConfig()
if err != nil {
log.Fatal(err)
Expand Down Expand Up @@ -805,7 +790,13 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
time.Sleep(rebootDelay)
}

invokeReboot(nodeID, rebootCommand)
if notifyURL != "" {
if err := shoutrrr.Send(notifyURL, fmt.Sprintf(messageTemplateReboot, nodeID)); err != nil {
log.Warnf("Error notifying: %v", err)
}
}

booter.Reboot()
for {
log.Infof("Waiting for reboot")
time.Sleep(time.Minute)
Expand Down Expand Up @@ -872,18 +863,38 @@ func root(cmd *cobra.Command, args []string) {
log.Infof("Reboot schedule: %v", window)
log.Infof("Reboot check command: %s every %v", sentinelCommand, period)
log.Infof("Concurrency: %v", concurrency)
log.Infof("Reboot command: %s", restartCommand)
log.Infof("Reboot method: %s", rebootMethod)
if rebootCommand == MethodCommand {
log.Infof("Reboot command: %s", restartCommand)
} else {
log.Infof("Reboot signal: %v", rebootSignal)
}

if annotateNodes {
log.Infof("Will annotate nodes during kured reboot operations")
}

// To run those commands as it was the host, we'll use nsenter
// Relies on hostPID:true and privileged:true to enter host mount space
// PID set to 1, until we have a better discovery mechanism.
hostSentinelCommand := buildHostCommand(1, sentinelCommand)
hostRestartCommand := buildHostCommand(1, restartCommand)

go rebootAsRequired(nodeID, hostRestartCommand, hostSentinelCommand, window, lockTTL, lockReleaseDelay)
// Only wrap sentinel-command with nsenter, if a custom-command was configured, otherwise use the host-path mount
hostSentinelCommand := sentinelCommand
if rebootSentinelCommand != "" {
hostSentinelCommand = buildHostCommand(1, sentinelCommand)
}

var booter reboot.Reboot
if rebootMethod == MethodCommand {
booter = reboot.NewCommandReboot(nodeID, hostRestartCommand)
} else if rebootMethod == MethodSignal {
booter = reboot.NewSignalReboot(nodeID, rebootSignal)
} else {
log.Fatalf("Invalid reboot-method configured: %s", rebootMethod)
}

go rebootAsRequired(nodeID, booter, hostSentinelCommand, window, lockTTL, lockReleaseDelay)
go maintainRebootRequiredMetric(nodeID, hostSentinelCommand)

http.Handle("/metrics", promhttp.Handler())
Expand Down
100 changes: 100 additions & 0 deletions kured-ds-signal.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: kured
namespace: kube-system
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: kured # Must match `--ds-name`
namespace: kube-system # Must match `--ds-namespace`
spec:
selector:
matchLabels:
name: kured
updateStrategy:
type: RollingUpdate
template:
metadata:
labels:
name: kured
spec:
serviceAccountName: kured
tolerations:
- key: node-role.kubernetes.io/control-plane
effect: NoSchedule
- key: node-role.kubernetes.io/master
effect: NoSchedule
hostPID: true # Facilitate entering the host mount namespace via init
restartPolicy: Always
volumes:
- name: sentinel
hostPath:
path: /var/run
type: Directory
containers:
- name: kured
# If you find yourself here wondering why there is no
# :latest tag on Docker Hub,see the FAQ in the README
image: ghcr.io/kubereboot/kured:1.13.2
imagePullPolicy: IfNotPresent
securityContext:
privileged: false # Give permission to nsenter /proc/1/ns/mnt
readOnlyRootFilesystem: true
allowPrivilegeEscalation: false
capabilities:
drop: ["*"]
add: ["CAP_KILL"]
ports:
- containerPort: 8080
name: metrics
env:
# Pass in the name of the node on which this pod is scheduled
# for use with drain/uncordon operations and lock acquisition
- name: KURED_NODE_ID
valueFrom:
fieldRef:
fieldPath: spec.nodeName
volumeMounts:
- mountPath: /sentinel
name: sentinel
readOnly: true
command:
- /usr/bin/kured
- --reboot-sentinel=/sentinel/reboot-required
- --reboot-method=signal
# - --reboot-signal=39
# - --force-reboot=false
# - --drain-grace-period=-1
# - --skip-wait-for-delete-timeout=0
# - --drain-timeout=0
# - --period=1h
# - --ds-namespace=kube-system
# - --ds-name=kured
# - --lock-annotation=weave.works/kured-node-lock
# - --lock-ttl=0
# - --prometheus-url=http://prometheus.monitoring.svc.cluster.local
# - --alert-filter-regexp=^RebootRequired$
# - --alert-firing-only=false
# - --prefer-no-schedule-taint=""
# - --reboot-sentinel-command=""
# - --slack-hook-url=https://hooks.slack.com/...
# - --slack-username=prod
# - --slack-channel=alerting
# - --notify-url="" # See also shoutrrr url format
# - --message-template-drain=Draining node %s
# - --message-template-reboot=Rebooting node %s
# - --message-template-uncordon=Node %s rebooted & uncordoned successfully!
# - --blocking-pod-selector=runtime=long,cost=expensive
# - --blocking-pod-selector=name=temperamental
# - --blocking-pod-selector=...
# - --reboot-days=sun,mon,tue,wed,thu,fri,sat
# - --reboot-delay=90s
# - --start-time=0:00
# - --end-time=23:59:59
# - --time-zone=UTC
# - --annotate-nodes=false
# - --lock-release-delay=30m
# - --log-format=text
Loading