Skip to content

Commit

Permalink
init.sh: install ip rules and routes with proto kernel
Browse files Browse the repository at this point in the history
In order to work around systemd's recent bad changes, where they decided
to manage "foreign" rules and to flush them on certain events (e.g.
device flap), we should add our rules as "proto kernel" so systemd will
just skip them and leave them in place on such events. Every modern
system with a new systemd running Cilium or other CNIs could otherwise
experience a full network outage for the node (not only on the k8s NICs),
because systemd would flush the rule for local traffic lookup
(from all lookup local)[1].

[1] Normal rules with Cilium deployed look like:
 $ ip ru
 9:	from all fwmark 0x200/0xf00 lookup 2004
 10:	from all fwmark 0xa00/0xf00 lookup 2005
 100:	from all lookup local
 32766:	from all lookup main
 32767:	from all lookup default
After a network event we see systemd flushing all unspec rules (9, 10
and 100, the last one being critical):
 $ ip rule list
 32766:  from all lookup main
 32767:  from all lookup default

This leads to a complete network outage for the node. With this change the
rules remain in place and everything continues working as expected.

Signed-off-by: Nikolay Aleksandrov <[email protected]>
  • Loading branch information
Nikolay Aleksandrov authored and youngnick committed Mar 25, 2023
1 parent c5403f1 commit d28e1aa
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 10 deletions.
21 changes: 11 additions & 10 deletions bpf/init.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ NR_CPUS=${21}
ENDPOINT_ROUTES=${22}
PROXY_RULE=${23}
FILTER_PRIO=${24}
DEFAULT_RTPROTO=${25}

ID_HOST=1
ID_WORLD=2
Expand Down Expand Up @@ -80,14 +81,14 @@ function move_local_rules_af()
# otherwise local addresses will not be reachable for a short period of
# time.
$IP rule list | grep 100 | grep "lookup local" || {
$IP rule add from all lookup local pref 100
$IP rule add from all lookup local pref 100 proto $DEFAULT_RTPROTO
}
$IP rule del from all lookup local pref 0 2> /dev/null || true

# check if the move of the local table rule was successful and restore
# it otherwise
if [ "$($IP rule list | grep "lookup local" | wc -l)" -eq "0" ]; then
$IP rule add from all lookup local pref 0
$IP rule add from all lookup local pref 0 proto $DEFAULT_RTPROTO
$IP rule del from all lookup local pref 100
echo "Error: The kernel does not support moving the local table routing rule"
echo "Local routing rules:"
Expand All @@ -111,13 +112,13 @@ function setup_proxy_rules()
{
# Any packet from an ingress proxy uses a separate routing table that routes
# the packet back to the cilium host device.
from_ingress_rulespec="fwmark 0xA00/0xF00 pref 10 lookup $PROXY_RT_TABLE"
from_ingress_rulespec="fwmark 0xA00/0xF00 pref 10 lookup $PROXY_RT_TABLE proto $DEFAULT_RTPROTO"

# Any packet to an ingress or egress proxy uses a separate routing table
# that routes the packet to the loopback device regardless of the destination
# address in the packet. For this to work the ctx must have a socket set
# (e.g., via TPROXY).
to_proxy_rulespec="fwmark 0x200/0xF00 pref 9 lookup $TO_PROXY_RT_TABLE"
to_proxy_rulespec="fwmark 0x200/0xF00 pref 9 lookup $TO_PROXY_RT_TABLE proto $DEFAULT_RTPROTO"

if [ "$IP4_HOST" != "<nil>" ]; then
if [ -n "$(ip -4 rule list)" ]; then
Expand All @@ -136,14 +137,14 @@ function setup_proxy_rules()
fi

# Traffic to the host proxy is local
ip route replace table $TO_PROXY_RT_TABLE local 0.0.0.0/0 dev lo
ip route replace table $TO_PROXY_RT_TABLE local 0.0.0.0/0 dev lo proto $DEFAULT_RTPROTO
# Traffic from ingress proxy goes to Cilium address space via the cilium host device
if [ "$ENDPOINT_ROUTES" = "true" ]; then
ip route delete table $PROXY_RT_TABLE $IP4_HOST/32 dev $HOST_DEV1 2>/dev/null || true
ip route delete table $PROXY_RT_TABLE default via $IP4_HOST 2>/dev/null || true
else
ip route replace table $PROXY_RT_TABLE $IP4_HOST/32 dev $HOST_DEV1
ip route replace table $PROXY_RT_TABLE default via $IP4_HOST
ip route replace table $PROXY_RT_TABLE $IP4_HOST/32 dev $HOST_DEV1 proto $DEFAULT_RTPROTO
ip route replace table $PROXY_RT_TABLE default via $IP4_HOST proto $DEFAULT_RTPROTO
fi
else
ip -4 rule del $to_proxy_rulespec 2> /dev/null || true
Expand All @@ -169,14 +170,14 @@ function setup_proxy_rules()
IP6_LLADDR=$(ip -6 addr show dev $HOST_DEV2 | grep inet6 | head -1 | awk '{print $2}' | awk -F'/' '{print $1}')
if [ -n "$IP6_LLADDR" ]; then
# Traffic to the host proxy is local
ip -6 route replace table $TO_PROXY_RT_TABLE local ::/0 dev lo
ip -6 route replace table $TO_PROXY_RT_TABLE local ::/0 dev lo proto $DEFAULT_RTPROTO
# Traffic from ingress proxy goes to Cilium address space via the cilium host device
if [ "$ENDPOINT_ROUTES" = "true" ]; then
ip -6 route delete table $PROXY_RT_TABLE ${IP6_LLADDR}/128 dev $HOST_DEV1 2>/dev/null || true
ip -6 route delete table $PROXY_RT_TABLE default via $IP6_LLADDR dev $HOST_DEV1 2>/dev/null || true
else
ip -6 route replace table $PROXY_RT_TABLE ${IP6_LLADDR}/128 dev $HOST_DEV1
ip -6 route replace table $PROXY_RT_TABLE default via $IP6_LLADDR dev $HOST_DEV1
ip -6 route replace table $PROXY_RT_TABLE ${IP6_LLADDR}/128 dev $HOST_DEV1 proto $DEFAULT_RTPROTO
ip -6 route replace table $PROXY_RT_TABLE default via $IP6_LLADDR dev $HOST_DEV1 proto $DEFAULT_RTPROTO
fi
fi
else
Expand Down
2 changes: 2 additions & 0 deletions pkg/datapath/loader/base.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ const (
initArgEndpointRoutes
initArgProxyRule
initTCFilterPriority
initDefaultRTProto
initArgMax
)

Expand Down Expand Up @@ -403,6 +404,7 @@ func (l *Loader) Reinitialize(ctx context.Context, o datapath.BaseProgramOwner,
}

args[initTCFilterPriority] = strconv.Itoa(int(option.Config.TCFilterPriority))
args[initDefaultRTProto] = strconv.Itoa(linux_defaults.RTProto)

// "Legacy" datapath initialization with the init.sh script
// TODO(mrostecki): Rewrite the whole init.sh in Go, step by step.
Expand Down

0 comments on commit d28e1aa

Please sign in to comment.