From d28e1aa517fbdd67951f75062d4a5098174a754d Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 10 Mar 2023 12:52:33 +0200 Subject: [PATCH] init.sh: install ip rules and routes with proto kernel In order to work around systemd's bad recent changes where they decided to manage "foreign" rules and to flush them on certain events (e.g. device flap), we should add our rules as "proto kernel" so systemd will just skip them and leave them in place in such events. Every modern system with a new systemd running Cilium or other CNIs could experience a full network outage (not only k8s NICs) because systemd would flush the rule for local traffic lookup (from all lookup local) which causes a full network outage for the node[1]. [1] Normal rules with Cilium deployed look like: $ ip ru 9: from all fwmark 0x200/0xf00 lookup 2004 10: from all fwmark 0xa00/0xf00 lookup 2005 100: from all lookup local 32766: from all lookup main 32767: from all lookup default After a network event we see systemd flushing all unspec rules (9, 10 and 100, the last one being critical): $ ip rule list 32766: from all lookup main 32767: from all lookup default This leads to a complete network outage for the node. With this change, the rules remain in place and everything continues working as expected. Signed-off-by: Nikolay Aleksandrov --- bpf/init.sh | 21 +++++++++++---------- pkg/datapath/loader/base.go | 2 ++ 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/bpf/init.sh b/bpf/init.sh index a1e3a4afa6c53..dab18d5f26819 100755 --- a/bpf/init.sh +++ b/bpf/init.sh @@ -28,6 +28,7 @@ NR_CPUS=${21} ENDPOINT_ROUTES=${22} PROXY_RULE=${23} FILTER_PRIO=${24} +DEFAULT_RTPROTO=${25} ID_HOST=1 ID_WORLD=2 @@ -80,14 +81,14 @@ function move_local_rules_af() # otherwise local addresses will not be reachable for a short period of # time.
$IP rule list | grep 100 | grep "lookup local" || { - $IP rule add from all lookup local pref 100 + $IP rule add from all lookup local pref 100 proto $DEFAULT_RTPROTO } $IP rule del from all lookup local pref 0 2> /dev/null || true # check if the move of the local table move was successful and restore # it otherwise if [ "$($IP rule list | grep "lookup local" | wc -l)" -eq "0" ]; then - $IP rule add from all lookup local pref 0 + $IP rule add from all lookup local pref 0 proto $DEFAULT_RTPROTO $IP rule del from all lookup local pref 100 echo "Error: The kernel does not support moving the local table routing rule" echo "Local routing rules:" @@ -111,13 +112,13 @@ function setup_proxy_rules() { # Any packet from an ingress proxy uses a separate routing table that routes # the packet back to the cilium host device. - from_ingress_rulespec="fwmark 0xA00/0xF00 pref 10 lookup $PROXY_RT_TABLE" + from_ingress_rulespec="fwmark 0xA00/0xF00 pref 10 lookup $PROXY_RT_TABLE proto $DEFAULT_RTPROTO" # Any packet to an ingress or egress proxy uses a separate routing table # that routes the packet to the loopback device regardless of the destination # address in the packet. For this to work the ctx must have a socket set # (e.g., via TPROXY). 
- to_proxy_rulespec="fwmark 0x200/0xF00 pref 9 lookup $TO_PROXY_RT_TABLE" + to_proxy_rulespec="fwmark 0x200/0xF00 pref 9 lookup $TO_PROXY_RT_TABLE proto $DEFAULT_RTPROTO" if [ "$IP4_HOST" != "" ]; then if [ -n "$(ip -4 rule list)" ]; then @@ -136,14 +137,14 @@ function setup_proxy_rules() fi # Traffic to the host proxy is local - ip route replace table $TO_PROXY_RT_TABLE local 0.0.0.0/0 dev lo + ip route replace table $TO_PROXY_RT_TABLE local 0.0.0.0/0 dev lo proto $DEFAULT_RTPROTO # Traffic from ingress proxy goes to Cilium address space via the cilium host device if [ "$ENDPOINT_ROUTES" = "true" ]; then ip route delete table $PROXY_RT_TABLE $IP4_HOST/32 dev $HOST_DEV1 2>/dev/null || true ip route delete table $PROXY_RT_TABLE default via $IP4_HOST 2>/dev/null || true else - ip route replace table $PROXY_RT_TABLE $IP4_HOST/32 dev $HOST_DEV1 - ip route replace table $PROXY_RT_TABLE default via $IP4_HOST + ip route replace table $PROXY_RT_TABLE $IP4_HOST/32 dev $HOST_DEV1 proto $DEFAULT_RTPROTO + ip route replace table $PROXY_RT_TABLE default via $IP4_HOST proto $DEFAULT_RTPROTO fi else ip -4 rule del $to_proxy_rulespec 2> /dev/null || true @@ -169,14 +170,14 @@ function setup_proxy_rules() IP6_LLADDR=$(ip -6 addr show dev $HOST_DEV2 | grep inet6 | head -1 | awk '{print $2}' | awk -F'/' '{print $1}') if [ -n "$IP6_LLADDR" ]; then # Traffic to the host proxy is local - ip -6 route replace table $TO_PROXY_RT_TABLE local ::/0 dev lo + ip -6 route replace table $TO_PROXY_RT_TABLE local ::/0 dev lo proto $DEFAULT_RTPROTO # Traffic from ingress proxy goes to Cilium address space via the cilium host device if [ "$ENDPOINT_ROUTES" = "true" ]; then ip -6 route delete table $PROXY_RT_TABLE ${IP6_LLADDR}/128 dev $HOST_DEV1 2>/dev/null || true ip -6 route delete table $PROXY_RT_TABLE default via $IP6_LLADDR dev $HOST_DEV1 2>/dev/null || true else - ip -6 route replace table $PROXY_RT_TABLE ${IP6_LLADDR}/128 dev $HOST_DEV1 - ip -6 route replace table $PROXY_RT_TABLE default via 
$IP6_LLADDR dev $HOST_DEV1 + ip -6 route replace table $PROXY_RT_TABLE ${IP6_LLADDR}/128 dev $HOST_DEV1 proto $DEFAULT_RTPROTO + ip -6 route replace table $PROXY_RT_TABLE default via $IP6_LLADDR dev $HOST_DEV1 proto $DEFAULT_RTPROTO fi fi else diff --git a/pkg/datapath/loader/base.go b/pkg/datapath/loader/base.go index cbeefe8970fee..1e9e028965888 100644 --- a/pkg/datapath/loader/base.go +++ b/pkg/datapath/loader/base.go @@ -61,6 +61,7 @@ const ( initArgEndpointRoutes initArgProxyRule initTCFilterPriority + initDefaultRTProto initArgMax ) @@ -403,6 +404,7 @@ func (l *Loader) Reinitialize(ctx context.Context, o datapath.BaseProgramOwner, } args[initTCFilterPriority] = strconv.Itoa(int(option.Config.TCFilterPriority)) + args[initDefaultRTProto] = strconv.Itoa(linux_defaults.RTProto) // "Legacy" datapath inizialization with the init.sh script // TODO(mrostecki): Rewrite the whole init.sh in Go, step by step.