From d1698a5046a81de6becccb3b973f5869f2262601 Mon Sep 17 00:00:00 2001
From: Ben Nemec
Date: Fri, 28 Aug 2020 15:13:43 -0500
Subject: [PATCH] Add support for stopping and starting keepalived

There are circumstances where keepalived can cause issues with the
networking on a node, notably when bridging a physical interface.
After the address has been moved to the bridge, it is possible for
old routes to exist that cause problems talking to other nodes,
which breaks the apiserver and prevents us from updating the
keepalived config to reflect the networking change. This leaves us
in a situation where the code can't recover properly from the bad
configuration. In short, the apiserver is waiting for keepalived
to update its configuration, but keepalived needs the apiserver in
order to do so.

This change addresses the problem by stopping keepalived if the
monitor fails to update the config more than 3 times in a row. That
will unconfigure any VIPs on the node, which should fix the error
described above. Once the bad routes related to the VIP(s) are gone,
the apiserver will recover and we'll be able to update the keepalived
config again. After that happens, keepalived is restarted.

This is one half of the fix. The other half will be in
baremetal-runtimecfg to call the control socket with stop and start
commands as appropriate.
--- manifests/on-prem/keepalived.yaml | 47 +++++++++++++------ pkg/operator/assets/bindata.go | 47 +++++++++++++------ .../common/on-prem/files/keepalived.yaml | 47 ++++++++++++++++--- 3 files changed, 107 insertions(+), 34 deletions(-) diff --git a/manifests/on-prem/keepalived.yaml b/manifests/on-prem/keepalived.yaml index f663e9f4ca..555a5d0faa 100644 --- a/manifests/on-prem/keepalived.yaml +++ b/manifests/on-prem/keepalived.yaml @@ -42,42 +42,61 @@ spec: if pid=$(pgrep -o keepalived); then kill -s SIGHUP "$pid" else - /usr/sbin/keepalived -f /etc/keepalived/keepalived.conf --dont-fork --vrrp --log-detail --log-console & + start_keepalived fi } + stop_keepalived() { - echo "Keepalived process stopped" >> /var/run/keepalived/stopped if pid=$(pgrep -o keepalived); then - kill -s TERM "$pid" + kill -s SIGTERM "$pid" + # The monitor runs every 10 seconds + sleep 9 + if pid=$(pgrep -o keepalived); then + kill -s SIGKILL "$pid" + fi + touch /var/run/keepalived/stopped + fi + } + + start_keepalived() + { + if ! 
pgrep -o keepalived > /dev/null; then + /usr/sbin/keepalived -f /etc/keepalived/keepalived.conf --dont-fork --vrrp --log-detail --log-console & fi + rm -f /var/run/keepalived/stopped } msg_handler() { while read -r line; do + # These get sent a lot, don't spam the logs with them + if [ "$line" = stop ]; then + stop_keepalived + continue + elif [ "$line" = start ]; then + start_keepalived + continue + fi echo "The client sent: $line" >&2 - # currently only 'reload' and 'stop' msgs are supported if [ "$line" = reload ]; then reload_keepalived - elif [ "$line" = stop ]; then - stop_keepalived + else + echo "Unrecognized command: $line" >&2 fi done } + set -ex declare -r keepalived_sock="/var/run/keepalived/keepalived.sock" export -f msg_handler export -f reload_keepalived export -f stop_keepalived - - while [ -s "/var/run/keepalived/stopped" ]; do - echo "Container stopped" - sleep 60 - done - if [ -s "/etc/keepalived/keepalived.conf" ]; then - /usr/sbin/keepalived -f /etc/keepalived/keepalived.conf --dont-fork --vrrp --log-detail --log-console & + export -f start_keepalived + if [ -s "/etc/keepalived/keepalived.conf" -a ! -e /var/run/keepalived/stopped ]; then + start_keepalived fi + rm -f "$keepalived_sock" socat UNIX-LISTEN:${keepalived_sock},fork system:'bash -c msg_handler' resources: @@ -96,7 +115,7 @@ spec: - -c - | [[ -s /etc/keepalived/keepalived.conf ]] || \ - [[ -s /var/run/keepalived/stopped ]] || \ + [[ -e /var/run/keepalived/stopped ]] || \ kill -s SIGUSR1 "$(pgrep -o keepalived)" && ! 
grep -q "State = FAULT" /tmp/keepalived.data initialDelaySeconds: 20 terminationMessagePolicy: FallbackToLogsOnError diff --git a/pkg/operator/assets/bindata.go b/pkg/operator/assets/bindata.go index b726ef2e73..c186becadd 100644 --- a/pkg/operator/assets/bindata.go +++ b/pkg/operator/assets/bindata.go @@ -1858,42 +1858,61 @@ spec: if pid=$(pgrep -o keepalived); then kill -s SIGHUP "$pid" else - /usr/sbin/keepalived -f /etc/keepalived/keepalived.conf --dont-fork --vrrp --log-detail --log-console & + start_keepalived fi } + stop_keepalived() { - echo "Keepalived process stopped" >> /var/run/keepalived/stopped if pid=$(pgrep -o keepalived); then - kill -s TERM "$pid" + kill -s SIGTERM "$pid" + # The monitor runs every 10 seconds + sleep 9 + if pid=$(pgrep -o keepalived); then + kill -s SIGKILL "$pid" + fi + touch /var/run/keepalived/stopped + fi + } + + start_keepalived() + { + if ! pgrep -o keepalived > /dev/null; then + /usr/sbin/keepalived -f /etc/keepalived/keepalived.conf --dont-fork --vrrp --log-detail --log-console & fi + rm -f /var/run/keepalived/stopped } msg_handler() { while read -r line; do + # These get sent a lot, don't spam the logs with them + if [ "$line" = stop ]; then + stop_keepalived + continue + elif [ "$line" = start ]; then + start_keepalived + continue + fi echo "The client sent: $line" >&2 - # currently only 'reload' and 'stop' msgs are supported if [ "$line" = reload ]; then reload_keepalived - elif [ "$line" = stop ]; then - stop_keepalived + else + echo "Unrecognized command: $line" >&2 fi done } + set -ex declare -r keepalived_sock="/var/run/keepalived/keepalived.sock" export -f msg_handler export -f reload_keepalived export -f stop_keepalived - - while [ -s "/var/run/keepalived/stopped" ]; do - echo "Container stopped" - sleep 60 - done - if [ -s "/etc/keepalived/keepalived.conf" ]; then - /usr/sbin/keepalived -f /etc/keepalived/keepalived.conf --dont-fork --vrrp --log-detail --log-console & + export -f start_keepalived + if [ -s 
"/etc/keepalived/keepalived.conf" -a ! -e /var/run/keepalived/stopped ]; then + start_keepalived fi + rm -f "$keepalived_sock" socat UNIX-LISTEN:${keepalived_sock},fork system:'bash -c msg_handler' resources: @@ -1912,7 +1931,7 @@ spec: - -c - | [[ -s /etc/keepalived/keepalived.conf ]] || \ - [[ -s /var/run/keepalived/stopped ]] || \ + [[ -e /var/run/keepalived/stopped ]] || \ kill -s SIGUSR1 "$(pgrep -o keepalived)" && ! grep -q "State = FAULT" /tmp/keepalived.data initialDelaySeconds: 20 terminationMessagePolicy: FallbackToLogsOnError diff --git a/templates/common/on-prem/files/keepalived.yaml b/templates/common/on-prem/files/keepalived.yaml index cf178bcc06..f57a3e421a 100644 --- a/templates/common/on-prem/files/keepalived.yaml +++ b/templates/common/on-prem/files/keepalived.yaml @@ -82,17 +82,49 @@ contents: if pid=$(pgrep -o keepalived); then kill -s SIGHUP "$pid" else - /usr/sbin/keepalived -f /etc/keepalived/keepalived.conf --dont-fork --vrrp --log-detail --log-console & + start_keepalived fi } + stop_keepalived() + { + if pid=$(pgrep -o keepalived); then + kill -s SIGTERM "$pid" + # Immediately create the stopped flag. Otherwise, if the liveness probe + # happens to fire during the sleep below it might incorrectly fail. + touch /var/run/keepalived/stopped + # The monitor runs every 10 seconds + sleep 9 + if pid=$(pgrep -o keepalived); then + kill -s SIGKILL "$pid" + fi + fi + } + + start_keepalived() + { + if ! 
pgrep -o keepalived > /dev/null; then + /usr/sbin/keepalived -f /etc/keepalived/keepalived.conf --dont-fork --vrrp --log-detail --log-console & + fi + rm -f /var/run/keepalived/stopped + } + msg_handler() { while read -r line; do + # These get sent a lot, don't spam the logs with them + if [ "$line" = stop ]; then + stop_keepalived + continue + elif [ "$line" = start ]; then + start_keepalived + continue + fi echo "The client sent: $line" >&2 - # currently only 'reload' msg is supported if [ "$line" = reload ]; then reload_keepalived + else + echo "Unrecognized command: $line" >&2 fi done } @@ -102,10 +134,13 @@ contents: export -f msg_handler export -f reload_keepalived export -f sigterm_handler + export -f stop_keepalived + export -f start_keepalived trap sigterm_handler SIGTERM - if [ -s "/etc/keepalived/keepalived.conf" ]; then - /usr/sbin/keepalived -f /etc/keepalived/keepalived.conf --dont-fork --vrrp --log-detail --log-console & + + if [ -s "/etc/keepalived/keepalived.conf" -a ! -e /var/run/keepalived/stopped ]; then + start_keepalived fi rm -f "$keepalived_sock" @@ -125,7 +160,7 @@ contents: - /bin/bash - -c - | - echo "State = FAULT" > /tmp/keepalived.data && kill -s SIGUSR1 "$(pgrep -o keepalived)" && for i in $(seq 5); do grep -q "State = FAULT" /tmp/keepalived.data && sleep 1 || exit 0; done && exit 1 + [ -e /var/run/keepalived/stopped ] || (echo "State = FAULT" > /tmp/keepalived.data && kill -s SIGUSR1 "$(pgrep -o keepalived)" && for i in $(seq 5); do grep -q "State = FAULT" /tmp/keepalived.data && sleep 1 || exit 0; done && exit 1) initialDelaySeconds: 20 terminationMessagePolicy: FallbackToLogsOnError imagePullPolicy: IfNotPresent @@ -168,4 +203,4 @@ contents: - operator: Exists priorityClassName: system-node-critical status: {} - {{ end -}} \ No newline at end of file + {{ end -}}