diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index d9d5d40eec..b9341080e0 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -483,14 +483,15 @@ jobs:
         - {"target": "network-segmentation", "ha": "noHA", "gateway-mode": "shared", "ipfamily": "ipv6", "disable-snat-multiple-gws": "noSnatGW", "second-bridge": "1br", "ic": "ic-single-node-zones"}
         - {"target": "bgp", "ha": "noHA", "gateway-mode": "local", "ipfamily": "dualstack", "disable-snat-multiple-gws": "snatGW", "second-bridge": "1br", "ic": "ic-single-node-zones", "routeadvertisements": "advertise-default", "network-segmentation": "enable-network-segmentation"}
         - {"target": "bgp", "ha": "noHA", "gateway-mode": "shared", "ipfamily": "dualstack", "disable-snat-multiple-gws": "noSnatGW", "second-bridge": "1br", "ic": "ic-single-node-zones", "routeadvertisements": "advertise-default", "network-segmentation": "enable-network-segmentation"}
+        - {"target": "bgp-loose-isolation", "ha": "noHA", "gateway-mode": "shared", "ipfamily": "dualstack", "disable-snat-multiple-gws": "snatGW", "second-bridge": "1br", "ic": "ic-single-node-zones", "routeadvertisements": "advertise-default", "network-segmentation": "enable-network-segmentation", "advertised-udn-isolation-mode": "loose"}
         - {"target": "traffic-flow-test-only","ha": "noHA", "gateway-mode": "shared", "ipfamily": "ipv4", "disable-snat-multiple-gws": "noSnatGW", "second-bridge": "1br", "ic": "ic-single-node-zones", "traffic-flow-tests": "1-24", "network-segmentation": "enable-network-segmentation"}
         - {"target": "tools", "ha": "noHA", "gateway-mode": "local", "ipfamily": "dualstack", "disable-snat-multiple-gws": "SnatGW", "second-bridge": "1br", "ic": "ic-single-node-zones", "network-segmentation": "enable-network-segmentation"}
     needs: [ build-pr ]
     env:
       JOB_NAME: "${{ matrix.target }}-${{ matrix.ha }}-${{ matrix.gateway-mode }}-${{ matrix.ipfamily }}-${{ matrix.disable-snat-multiple-gws }}-${{ matrix.second-bridge }}-${{ matrix.ic }}"
       OVN_HYBRID_OVERLAY_ENABLE: ${{ (matrix.target == 'control-plane' || matrix.target == 'control-plane-helm') && (matrix.ipfamily == 'ipv4' || matrix.ipfamily == 'dualstack' ) }}
-      OVN_MULTICAST_ENABLE: "${{ matrix.target == 'control-plane' || matrix.target == 'control-plane-helm' || matrix.target == 'network-segmentation' || matrix.target == 'bgp' }}"
-      OVN_EMPTY_LB_EVENTS: "${{ matrix.target == 'control-plane' || matrix.target == 'control-plane-helm' || matrix.target == 'bgp' }}"
+      OVN_MULTICAST_ENABLE: "${{ matrix.target == 'control-plane' || matrix.target == 'control-plane-helm' || matrix.target == 'network-segmentation' || matrix.target == 'bgp' || matrix.target == 'bgp-loose-isolation' }}"
+      OVN_EMPTY_LB_EVENTS: "${{ matrix.target == 'control-plane' || matrix.target == 'control-plane-helm' || matrix.target == 'bgp' || matrix.target == 'bgp-loose-isolation' }}"
       OVN_HA: "${{ matrix.ha == 'HA' }}"
       OVN_DISABLE_SNAT_MULTIPLE_GWS: "${{ matrix.disable-snat-multiple-gws == 'noSnatGW' }}"
       KIND_INSTALL_METALLB: "${{ matrix.target == 'control-plane' || matrix.target == 'control-plane-helm' || matrix.target == 'network-segmentation' }}"
@@ -513,6 +514,7 @@ jobs:
       TRAFFIC_FLOW_TESTS: "${{ matrix.traffic-flow-tests }}"
       ENABLE_ROUTE_ADVERTISEMENTS: "${{ matrix.routeadvertisements != '' }}"
       ADVERTISE_DEFAULT_NETWORK: "${{ matrix.routeadvertisements == 'advertise-default' }}"
+      ADVERTISED_UDN_ISOLATION_MODE: "${{ matrix.advertised-udn-isolation-mode }}"
     steps:
     - name: Install VRF kernel module
@@ -646,7 +648,7 @@ jobs:
       # set 3 hours for control-plane tests as these might take a while
       # give 10m extra to give ginkgo chance to timeout before github so that we
       # get its output
-      timeout-minutes: ${{ matrix.target == 'bgp' && 190 || matrix.target == 'control-plane' && 190 || matrix.target == 'control-plane-helm' && 190 || matrix.target == 'external-gateway' && 190 || 130 }}
+      timeout-minutes: ${{ matrix.target == 'bgp-loose-isolation' && 190 || matrix.target == 'bgp' && 190 || matrix.target == 'control-plane' && 190 || matrix.target == 'control-plane-helm' && 190 || matrix.target == 'external-gateway' && 190 || 130 }}
       run: |
         # used by e2e diagnostics package
         export OVN_IMAGE="ovn-daemonset-fedora:pr"
@@ -670,7 +672,7 @@ jobs:
           fi
         elif [ "${{ matrix.target }}" == "network-segmentation" ]; then
           make -C test control-plane WHAT="Network Segmentation"
-        elif [ "${{ matrix.target }}" == "bgp" ]; then
+        elif [ "${{ matrix.target }}" == "bgp" ] || [ "${{ matrix.target }}" == "bgp-loose-isolation" ]; then
           make -C test control-plane
         elif [ "${{ matrix.target }}" == "tools" ]; then
           make -C go-controller build
diff --git a/contrib/kind-common b/contrib/kind-common
index 2a564dece0..c11265d259 100644
--- a/contrib/kind-common
+++ b/contrib/kind-common
@@ -719,22 +719,52 @@ deploy_bgp_external_server() {
   $OCI_BIN run --cap-add NET_ADMIN --user 0 -d --network bgpnet --rm --name bgpserver -p 8080:8080 registry.k8s.io/e2e-test-images/agnhost:2.45 netexec
   # let's make the bgp external server have its default route towards FRR router so that we don't need to add routes during tests back to the pods in the
   # cluster for return traffic
-  local bgp_network_frr_v4 bgp_network_frr_v6
+  local bgp_network_frr_v4 bgp_network_frr_v6 kind_network_frr_v4 kind_network_frr_v6
   bgp_network_frr_v4=$($OCI_BIN inspect -f '{{.NetworkSettings.Networks.bgpnet.IPAddress}}' frr)
-  echo "FRR kind network IPv4: ${bgp_network_frr_v4}"
+  echo "FRR bgp network IPv4: ${bgp_network_frr_v4}"
   $OCI_BIN exec bgpserver ip route replace default via "$bgp_network_frr_v4"
   if [ "$PLATFORM_IPV6_SUPPORT" == true ] ; then
     bgp_network_frr_v6=$($OCI_BIN inspect -f '{{.NetworkSettings.Networks.bgpnet.GlobalIPv6Address}}' frr)
-    echo "FRR kind network IPv6: ${bgp_network_frr_v6}"
+    echo "FRR bgp network IPv6: ${bgp_network_frr_v6}"
     $OCI_BIN exec bgpserver ip -6 route replace default via "$bgp_network_frr_v6"
   fi
-  # disable the default route to make sure the container only routes accross
-  # directly connected or learnt networks (doing this at the very end since
-  # docker changes the routing table when a new network is connected)
-  $OCI_BIN exec frr ip route delete default
-  $OCI_BIN exec frr ip route
-  $OCI_BIN exec frr ip -6 route delete default
-  $OCI_BIN exec frr ip -6 route
+  if [ "$ADVERTISED_UDN_ISOLATION_MODE" == "loose" ]; then
+    kind_network_frr_v4=$($OCI_BIN inspect -f '{{.NetworkSettings.Networks.kind.IPAddress}}' frr)
+    echo "FRR kind network IPv4: ${kind_network_frr_v4}"
+    # If UDN isolation is in loose mode, we need to set the default gateway for the nodes in the
+    # cluster to the FRR router so that cross-UDN traffic can be routed back to the pods in the cluster.
+    echo "Setting default gateway for nodes in the cluster to FRR router IPv4: ${kind_network_frr_v4}"
+    set_nodes_default_gw "$kind_network_frr_v4"
+    if [ "$PLATFORM_IPV6_SUPPORT" == true ] ; then
+      kind_network_frr_v6=$($OCI_BIN inspect -f '{{.NetworkSettings.Networks.kind.GlobalIPv6Address}}' frr)
+      echo "FRR kind network IPv6: ${kind_network_frr_v6}"
+      set_nodes_default_gw "$kind_network_frr_v6"
+    fi
+  else
+    # disable the default route to make sure the container only routes across
+    # directly connected or learnt networks (doing this at the very end since
+    # docker changes the routing table when a new network is connected)
+    $OCI_BIN exec frr ip route delete default
+    $OCI_BIN exec frr ip route
+    $OCI_BIN exec frr ip -6 route delete default
+    $OCI_BIN exec frr ip -6 route
+  fi
+}
+
+set_nodes_default_gw() {
+  local gw="$1"
+  local ip_cmd="ip"
+  local route_cmd="route replace default via"
+
+  # Check if $gw is IPv6 (contains ':')
+  if [[ "$gw" == *:* ]]; then
+    ip_cmd="ip -6"
+  fi
+
+  KIND_NODES=$(kind_get_nodes)
+  for node in $KIND_NODES; do
+    $OCI_BIN exec "$node" $ip_cmd $route_cmd "$gw"
+  done
+}

 destroy_bgp() {
echo "" } @@ -347,6 +349,8 @@ parse_args() { ;; -adv | --advertise-default-network) ADVERTISE_DEFAULT_NETWORK=true ;; + -rud | --routed-udn-isolation-disable) ADVERTISED_UDN_ISOLATION_MODE=loose + ;; -ic | --enable-interconnect ) OVN_ENABLE_INTERCONNECT=true ;; --disable-ovnkube-identity) OVN_ENABLE_OVNKUBE_IDENTITY=false @@ -439,6 +443,7 @@ print_params() { echo "ENABLE_MULTI_NET = $ENABLE_MULTI_NET" echo "ENABLE_NETWORK_SEGMENTATION= $ENABLE_NETWORK_SEGMENTATION" echo "ENABLE_ROUTE_ADVERTISEMENTS= $ENABLE_ROUTE_ADVERTISEMENTS" + echo "ADVERTISED_UDN_ISOLATION_MODE= $ADVERTISED_UDN_ISOLATION_MODE" echo "ADVERTISE_DEFAULT_NETWORK = $ADVERTISE_DEFAULT_NETWORK" echo "ENABLE_PRE_CONF_UDN_ADDR = $ENABLE_PRE_CONF_UDN_ADDR" echo "OVN_ENABLE_INTERCONNECT = $OVN_ENABLE_INTERCONNECT" @@ -685,6 +690,7 @@ set_default_params() { echo "Preconfigured UDN addresses requires interconnect to be enabled (-ic)" exit 1 fi + ADVERTISED_UDN_ISOLATION_MODE=${ADVERTISED_UDN_ISOLATION_MODE:-strict} ADVERTISE_DEFAULT_NETWORK=${ADVERTISE_DEFAULT_NETWORK:-false} OVN_COMPACT_MODE=${OVN_COMPACT_MODE:-false} if [ "$OVN_COMPACT_MODE" == true ]; then @@ -941,6 +947,7 @@ create_ovn_kube_manifests() { --preconfigured-udn-addresses-enable="${ENABLE_PRE_CONF_UDN_ADDR}" \ --route-advertisements-enable="${ENABLE_ROUTE_ADVERTISEMENTS}" \ --advertise-default-network="${ADVERTISE_DEFAULT_NETWORK}" \ + --advertised-udn-isolation-mode="${ADVERTISED_UDN_ISOLATION_MODE}" \ --ovnkube-metrics-scale-enable="${OVN_METRICS_SCALE_ENABLE}" \ --compact-mode="${OVN_COMPACT_MODE}" \ --enable-interconnect="${OVN_ENABLE_INTERCONNECT}" \ diff --git a/dist/images/daemonset.sh b/dist/images/daemonset.sh index 0613a37238..ed11d09905 100755 --- a/dist/images/daemonset.sh +++ b/dist/images/daemonset.sh @@ -74,6 +74,7 @@ OVN_NETWORK_SEGMENTATION_ENABLE= OVN_PRE_CONF_UDN_ADDR_ENABLE= OVN_ROUTE_ADVERTISEMENTS_ENABLE= OVN_ADVERTISE_DEFAULT_NETWORK= +OVN_ADVERTISED_UDN_ISOLATION_MODE= OVN_V4_JOIN_SUBNET="" OVN_V6_JOIN_SUBNET="" OVN_V4_MASQUERADE_SUBNET="" @@ -283,6 +284,9 @@ while [ "$1" != "" ]; do --advertise-default-network) OVN_ADVERTISE_DEFAULT_NETWORK=$VALUE ;; + --advertised-udn-isolation-mode) + OVN_ADVERTISED_UDN_ISOLATION_MODE=$VALUE + ;; --egress-service-enable) OVN_EGRESSSERVICE_ENABLE=$VALUE ;; @@ -478,6 +482,8 @@ ovn_route_advertisements_enable=${OVN_ROUTE_ADVERTISEMENTS_ENABLE} echo "ovn_route_advertisements_enable: ${ovn_route_advertisements_enable}" ovn_advertise_default_network=${OVN_ADVERTISE_DEFAULT_NETWORK} echo "ovn_advertise_default_network: ${ovn_advertise_default_network}" +ovn_advertised_udn_isolation_mode=${OVN_ADVERTISED_UDN_ISOLATION_MODE} +echo "ovn_advertised_udn_isolation_mode: ${ovn_advertised_udn_isolation_mode}" ovn_hybrid_overlay_net_cidr=${OVN_HYBRID_OVERLAY_NET_CIDR} echo "ovn_hybrid_overlay_net_cidr: ${ovn_hybrid_overlay_net_cidr}" ovn_disable_snat_multiple_gws=${OVN_DISABLE_SNAT_MULTIPLE_GWS} @@ -620,6 +626,7 @@ ovn_image=${ovnkube_image} \ ovn_network_segmentation_enable=${ovn_network_segmentation_enable} \ ovn_pre_conf_udn_addr_enable=${ovn_pre_conf_udn_addr_enable} \ ovn_route_advertisements_enable=${ovn_route_advertisements_enable} \ + ovn_advertised_udn_isolation_mode=${ovn_advertised_udn_isolation_mode} \ ovn_egress_service_enable=${ovn_egress_service_enable} \ ovn_ssl_en=${ovn_ssl_en} \ ovn_remote_probe_interval=${ovn_remote_probe_interval} \ @@ -674,6 +681,7 @@ ovn_image=${ovnkube_image} \ ovn_multi_network_enable=${ovn_multi_network_enable} \ ovn_network_segmentation_enable=${ovn_network_segmentation_enable} 
diff --git a/dist/images/daemonset.sh b/dist/images/daemonset.sh
index 0613a37238..ed11d09905 100755
--- a/dist/images/daemonset.sh
+++ b/dist/images/daemonset.sh
@@ -74,6 +74,7 @@
 OVN_NETWORK_SEGMENTATION_ENABLE=
 OVN_PRE_CONF_UDN_ADDR_ENABLE=
 OVN_ROUTE_ADVERTISEMENTS_ENABLE=
 OVN_ADVERTISE_DEFAULT_NETWORK=
+OVN_ADVERTISED_UDN_ISOLATION_MODE=
 OVN_V4_JOIN_SUBNET=""
 OVN_V6_JOIN_SUBNET=""
 OVN_V4_MASQUERADE_SUBNET=""
@@ -283,6 +284,9 @@ while [ "$1" != "" ]; do
     --advertise-default-network)
       OVN_ADVERTISE_DEFAULT_NETWORK=$VALUE
       ;;
+    --advertised-udn-isolation-mode)
+      OVN_ADVERTISED_UDN_ISOLATION_MODE=$VALUE
+      ;;
     --egress-service-enable)
       OVN_EGRESSSERVICE_ENABLE=$VALUE
       ;;
@@ -478,6 +482,8 @@
 ovn_route_advertisements_enable=${OVN_ROUTE_ADVERTISEMENTS_ENABLE}
 echo "ovn_route_advertisements_enable: ${ovn_route_advertisements_enable}"
 ovn_advertise_default_network=${OVN_ADVERTISE_DEFAULT_NETWORK}
 echo "ovn_advertise_default_network: ${ovn_advertise_default_network}"
+ovn_advertised_udn_isolation_mode=${OVN_ADVERTISED_UDN_ISOLATION_MODE}
+echo "ovn_advertised_udn_isolation_mode: ${ovn_advertised_udn_isolation_mode}"
 ovn_hybrid_overlay_net_cidr=${OVN_HYBRID_OVERLAY_NET_CIDR}
 echo "ovn_hybrid_overlay_net_cidr: ${ovn_hybrid_overlay_net_cidr}"
 ovn_disable_snat_multiple_gws=${OVN_DISABLE_SNAT_MULTIPLE_GWS}
@@ -620,6 +626,7 @@ ovn_image=${ovnkube_image} \
   ovn_network_segmentation_enable=${ovn_network_segmentation_enable} \
   ovn_pre_conf_udn_addr_enable=${ovn_pre_conf_udn_addr_enable} \
   ovn_route_advertisements_enable=${ovn_route_advertisements_enable} \
+  ovn_advertised_udn_isolation_mode=${ovn_advertised_udn_isolation_mode} \
   ovn_egress_service_enable=${ovn_egress_service_enable} \
   ovn_ssl_en=${ovn_ssl_en} \
   ovn_remote_probe_interval=${ovn_remote_probe_interval} \
@@ -674,6 +681,7 @@ ovn_image=${ovnkube_image} \
   ovn_multi_network_enable=${ovn_multi_network_enable} \
   ovn_network_segmentation_enable=${ovn_network_segmentation_enable} \
   ovn_route_advertisements_enable=${ovn_route_advertisements_enable} \
+  ovn_advertised_udn_isolation_mode=${ovn_advertised_udn_isolation_mode} \
   ovn_egress_service_enable=${ovn_egress_service_enable} \
   ovn_ssl_en=${ovn_ssl_en} \
   ovn_remote_probe_interval=${ovn_remote_probe_interval} \
@@ -773,6 +781,7 @@ ovn_image=${ovnkube_image} \
   ovn_multi_network_enable=${ovn_multi_network_enable} \
   ovn_network_segmentation_enable=${ovn_network_segmentation_enable} \
   ovn_route_advertisements_enable=${ovn_route_advertisements_enable} \
+  ovn_advertised_udn_isolation_mode=${ovn_advertised_udn_isolation_mode} \
   ovn_egress_service_enable=${ovn_egress_service_enable} \
   ovn_ssl_en=${ovn_ssl_en} \
   ovn_master_count=${ovn_master_count} \
@@ -823,6 +832,7 @@ ovn_image=${ovnkube_image} \
   ovn_network_segmentation_enable=${ovn_network_segmentation_enable} \
   ovn_pre_conf_udn_addr_enable=${ovn_pre_conf_udn_addr_enable} \
   ovn_route_advertisements_enable=${ovn_route_advertisements_enable} \
+  ovn_advertised_udn_isolation_mode=${ovn_advertised_udn_isolation_mode} \
   ovn_egress_service_enable=${ovn_egress_service_enable} \
   ovn_ssl_en=${ovn_ssl_en} \
   ovn_master_count=${ovn_master_count} \
@@ -904,6 +914,7 @@ ovn_image=${ovnkube_image} \
   ovn_network_segmentation_enable=${ovn_network_segmentation_enable} \
   ovn_pre_conf_udn_addr_enable=${ovn_pre_conf_udn_addr_enable} \
   ovn_route_advertisements_enable=${ovn_route_advertisements_enable} \
+  ovn_advertised_udn_isolation_mode=${ovn_advertised_udn_isolation_mode} \
   ovn_egress_service_enable=${ovn_egress_service_enable} \
   ovn_ssl_en=${ovn_ssl_en} \
   ovn_remote_probe_interval=${ovn_remote_probe_interval} \
@@ -972,6 +983,7 @@ ovn_image=${ovnkube_image} \
   ovn_network_segmentation_enable=${ovn_network_segmentation_enable} \
   ovn_pre_conf_udn_addr_enable=${ovn_pre_conf_udn_addr_enable} \
   ovn_route_advertisements_enable=${ovn_route_advertisements_enable} \
+  ovn_advertised_udn_isolation_mode=${ovn_advertised_udn_isolation_mode} \
   ovn_ssl_en=${ovn_ssl_en} \
   ovn_remote_probe_interval=${ovn_remote_probe_interval} \
   ovn_monitor_all=${ovn_monitor_all} \
@@ -1070,11 +1082,13 @@
 ovn_network_segmentation_enable=${ovn_network_segmentation_enable} \
 ovn_pre_conf_udn_addr_enable=${ovn_pre_conf_udn_addr_enable} \
 ovn_enable_dnsnameresolver=${ovn_enable_dnsnameresolver} \
 ovn_route_advertisements_enable=${ovn_route_advertisements_enable} \
+ovn_advertised_udn_isolation_mode=${ovn_advertised_udn_isolation_mode} \
 jinjanate ../templates/rbac-ovnkube-cluster-manager.yaml.j2 -o ${output_dir}/rbac-ovnkube-cluster-manager.yaml
 
 ovn_network_segmentation_enable=${ovn_network_segmentation_enable} \
 ovn_enable_dnsnameresolver=${ovn_enable_dnsnameresolver} \
 ovn_route_advertisements_enable=${ovn_route_advertisements_enable} \
+ovn_advertised_udn_isolation_mode=${ovn_advertised_udn_isolation_mode} \
 jinjanate ../templates/rbac-ovnkube-master.yaml.j2 -o ${output_dir}/rbac-ovnkube-master.yaml
 
 cp ../templates/rbac-ovnkube-identity.yaml.j2 ${output_dir}/rbac-ovnkube-identity.yaml
diff --git a/dist/images/ovnkube.sh b/dist/images/ovnkube.sh
index e016ce4a47..73e1f42ead 100755
--- a/dist/images/ovnkube.sh
+++ b/dist/images/ovnkube.sh
@@ -273,6 +273,8 @@
 ovn_network_segmentation_enable=${OVN_NETWORK_SEGMENTATION_ENABLE:=false}
 ovn_pre_conf_udn_addr_enable=${OVN_PRE_CONF_UDN_ADDR_ENABLE:=false}
 #OVN_NROUTE_ADVERTISEMENTS_ENABLE - enable route advertisements for ovn-kubernetes
 ovn_route_advertisements_enable=${OVN_ROUTE_ADVERTISEMENTS_ENABLE:=false}
+#OVN_ADVERTISED_UDN_ISOLATION_MODE - pod network isolation between advertised UDN networks.
+ovn_advertised_udn_isolation_mode=${OVN_ADVERTISED_UDN_ISOLATION_MODE:=strict}
 ovn_acl_logging_rate_limit=${OVN_ACL_LOGGING_RATE_LIMIT:-"20"}
 ovn_netflow_targets=${OVN_NETFLOW_TARGETS:-}
 ovn_sflow_targets=${OVN_SFLOW_TARGETS:-}
@@ -1249,6 +1251,11 @@ ovn-master() {
   fi
   echo "route_advertisements_enabled_flag=${route_advertisements_enabled_flag}"
 
+  advertised_udn_isolation_flag=
+  if [[ -n ${ovn_advertised_udn_isolation_mode} ]]; then
+    advertised_udn_isolation_flag="--advertised-udn-isolation-mode=${ovn_advertised_udn_isolation_mode}"
+  fi
+
   egressservice_enabled_flag=
   if [[ ${ovn_egressservice_enable} == "true" ]]; then
     egressservice_enabled_flag="--enable-egress-service"
@@ -1356,6 +1363,7 @@ ovn-master() {
     ${multi_network_enabled_flag} \
     ${network_segmentation_enabled_flag} \
     ${route_advertisements_enabled_flag} \
+    ${advertised_udn_isolation_flag} \
     ${ovn_acl_logging_rate_limit_flag} \
     ${ovn_enable_svc_template_support_flag} \
     ${ovn_observ_enable_flag} \
@@ -1558,6 +1566,12 @@ ovnkube-controller() {
   fi
   echo "route_advertisements_enabled_flag=${route_advertisements_enabled_flag}"
 
+  advertised_udn_isolation_flag=
+  if [[ -n ${ovn_advertised_udn_isolation_mode} ]]; then
+    advertised_udn_isolation_flag="--advertised-udn-isolation-mode=${ovn_advertised_udn_isolation_mode}"
+  fi
+  echo "advertised_udn_isolation_flag=${advertised_udn_isolation_flag}"
+
   egressservice_enabled_flag=
   if [[ ${ovn_egressservice_enable} == "true" ]]; then
     egressservice_enabled_flag="--enable-egress-service"
@@ -1674,6 +1688,7 @@ ovnkube-controller() {
     ${network_segmentation_enabled_flag} \
     ${pre_conf_udn_addr_enable_flag} \
     ${route_advertisements_enabled_flag} \
+    ${advertised_udn_isolation_flag} \
     ${ovn_acl_logging_rate_limit_flag} \
     ${ovn_dbs} \
     ${ovn_enable_svc_template_support_flag} \
@@ -1869,6 +1884,12 @@ ovnkube-controller-with-node() {
   fi
   echo "route_advertisements_enabled_flag=${route_advertisements_enabled_flag}"
 
+  advertised_udn_isolation_flag=
+  if [[ -n ${ovn_advertised_udn_isolation_mode} ]]; then
+    advertised_udn_isolation_flag="--advertised-udn-isolation-mode=${ovn_advertised_udn_isolation_mode}"
+  fi
+  echo "advertised_udn_isolation_flag=${advertised_udn_isolation_flag}"
+
   egressservice_enabled_flag=
   if [[ ${ovn_egressservice_enable} == "true" ]]; then
     egressservice_enabled_flag="--enable-egress-service"
@@ -2053,6 +2074,7 @@ ovnkube-controller-with-node() {
     ovnkube_metrics_scale_enable_flag="--metrics-enable-scale --metrics-enable-pprof"
   fi
   echo "ovnkube_metrics_scale_enable_flag: ${ovnkube_metrics_scale_enable_flag}"
+
   ovnkube_local_cert_flags=
   if [[ ${ovn_enable_ovnkube_identity} == "true" ]]; then
     bootstrap_kubeconfig="/host-kubernetes/kubelet.conf"
@@ -2130,6 +2152,7 @@ ovnkube-controller-with-node() {
     ${network_segmentation_enabled_flag} \
     ${pre_conf_udn_addr_enable_flag} \
     ${route_advertisements_enabled_flag} \
+    ${advertised_udn_isolation_flag} \
     ${netflow_targets} \
     ${ofctrl_wait_before_clear} \
     ${ovn_acl_logging_rate_limit_flag} \
@@ -2308,6 +2331,11 @@ ovn-cluster-manager() {
   fi
   echo "route_advertisements_enabled_flag=${route_advertisements_enabled_flag}"
 
+  advertised_udn_isolation_flag=
+  if [[ -n ${ovn_advertised_udn_isolation_mode} ]]; then
+    advertised_udn_isolation_flag="--advertised-udn-isolation-mode=${ovn_advertised_udn_isolation_mode}"
+  fi
+
   persistent_ips_enabled_flag=
   if [[ ${ovn_enable_persistent_ips} == "true" ]]; then
     persistent_ips_enabled_flag="--enable-persistent-ips"
@@ -2371,6 +2399,7 @@ ovn-cluster-manager() {
     ${network_segmentation_enabled_flag} \
     ${pre_conf_udn_addr_enable_flag} \
     ${route_advertisements_enabled_flag} \
+    ${advertised_udn_isolation_flag} \
     ${persistent_ips_enabled_flag} \
     ${ovnkube_enable_interconnect_flag} \
     ${ovnkube_enable_multi_external_gateway_flag} \
@@ -2557,6 +2586,11 @@ ovn-node() {
     route_advertisements_enabled_flag="--enable-route-advertisements"
   fi
 
+  advertised_udn_isolation_flag=
+  if [[ -n ${ovn_advertised_udn_isolation_mode} ]]; then
+    advertised_udn_isolation_flag="--advertised-udn-isolation-mode=${ovn_advertised_udn_isolation_mode}"
+  fi
+
   netflow_targets=
   if [[ -n ${ovn_netflow_targets} ]]; then
     netflow_targets="--netflow-targets ${ovn_netflow_targets}"
@@ -2789,6 +2823,7 @@ ovn-node() {
     ${network_segmentation_enabled_flag} \
     ${pre_conf_udn_addr_enable_flag} \
     ${route_advertisements_enabled_flag} \
+    ${advertised_udn_isolation_flag} \
     ${netflow_targets} \
     ${ofctrl_wait_before_clear} \
     ${ovn_dbs} \
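The plumbing above is uniform: daemonset.sh hands ovn_advertised_udn_isolation_mode to every template, the templates export OVN_ADVERTISED_UDN_ISOLATION_MODE into the pods, and ovnkube.sh turns it into the same CLI flag for each entrypoint. A quick sanity check of the rendered output (the output path and any extra daemonset.sh flags are illustrative; only --advertised-udn-isolation-mode comes from this diff):

```shell
cd dist/images
./daemonset.sh --advertised-udn-isolation-mode=loose   # plus your usual flags
grep -A1 OVN_ADVERTISED_UDN_ISOLATION_MODE ../yaml/ovnkube-node.yaml
#         - name: OVN_ADVERTISED_UDN_ISOLATION_MODE
#           value: "loose"
```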
diff --git a/dist/templates/ovnkube-control-plane.yaml.j2 b/dist/templates/ovnkube-control-plane.yaml.j2
index 13403ceb31..a86394aac7 100644
--- a/dist/templates/ovnkube-control-plane.yaml.j2
+++ b/dist/templates/ovnkube-control-plane.yaml.j2
@@ -150,6 +150,8 @@ spec:
           value: "{{ ovn_pre_conf_udn_addr_enable }}"
         - name: OVN_ROUTE_ADVERTISEMENTS_ENABLE
           value: "{{ ovn_route_advertisements_enable }}"
+        - name: OVN_ADVERTISED_UDN_ISOLATION_MODE
+          value: "{{ ovn_advertised_udn_isolation_mode }}"
         - name: OVN_HYBRID_OVERLAY_NET_CIDR
           value: "{{ ovn_hybrid_overlay_net_cidr }}"
         - name: OVN_DISABLE_SNAT_MULTIPLE_GWS
diff --git a/dist/templates/ovnkube-master.yaml.j2 b/dist/templates/ovnkube-master.yaml.j2
index 47ea81a6dc..bbe2156940 100644
--- a/dist/templates/ovnkube-master.yaml.j2
+++ b/dist/templates/ovnkube-master.yaml.j2
@@ -261,6 +261,8 @@ spec:
           value: "{{ ovn_network_segmentation_enable }}"
         - name: OVN_ROUTE_ADVERTISEMENTS_ENABLE
           value: "{{ ovn_route_advertisements_enable }}"
+        - name: OVN_ADVERTISED_UDN_ISOLATION_MODE
+          value: "{{ ovn_advertised_udn_isolation_mode }}"
         - name: OVN_EGRESSSERVICE_ENABLE
           value: "{{ ovn_egress_service_enable }}"
         - name: OVN_HYBRID_OVERLAY_NET_CIDR
diff --git a/dist/templates/ovnkube-node.yaml.j2 b/dist/templates/ovnkube-node.yaml.j2
index 2bf8e5ed2a..ba77eb8272 100644
--- a/dist/templates/ovnkube-node.yaml.j2
+++ b/dist/templates/ovnkube-node.yaml.j2
@@ -242,6 +242,8 @@ spec:
           value: "{{ ovn_network_segmentation_enable }}"
         - name: OVN_ROUTE_ADVERTISEMENTS_ENABLE
           value: "{{ ovn_route_advertisements_enable }}"
+        - name: OVN_ADVERTISED_UDN_ISOLATION_MODE
+          value: "{{ ovn_advertised_udn_isolation_mode }}"
         - name: OVN_ENABLE_INTERCONNECT
           value: "{{ ovn_enable_interconnect }}"
         - name: OVN_ENABLE_MULTI_EXTERNAL_GATEWAY
diff --git a/dist/templates/ovnkube-single-node-zone.yaml.j2 b/dist/templates/ovnkube-single-node-zone.yaml.j2
index ad4e4488f9..b7565acdd5 100644
--- a/dist/templates/ovnkube-single-node-zone.yaml.j2
+++ b/dist/templates/ovnkube-single-node-zone.yaml.j2
@@ -441,6 +441,8 @@ spec:
           value: "{{ ovn_pre_conf_udn_addr_enable }}"
         - name: OVN_ROUTE_ADVERTISEMENTS_ENABLE
           value: "{{ ovn_route_advertisements_enable }}"
+        - name: OVN_ADVERTISED_UDN_ISOLATION_MODE
+          value: "{{ ovn_advertised_udn_isolation_mode }}"
         - name: OVNKUBE_NODE_MGMT_PORT_NETDEV
           value: "{{ ovnkube_node_mgmt_port_netdev }}"
         - name: OVN_EMPTY_LB_EVENTS
diff --git a/dist/templates/ovnkube-zone-controller.yaml.j2 b/dist/templates/ovnkube-zone-controller.yaml.j2
index cc87fe1a53..6b928afe4d 100644
--- a/dist/templates/ovnkube-zone-controller.yaml.j2
+++ b/dist/templates/ovnkube-zone-controller.yaml.j2
@@ -349,6 +349,8 @@ spec:
           value: "{{ ovn_pre_conf_udn_addr_enable }}"
         - name: OVN_ROUTE_ADVERTISEMENTS_ENABLE
           value: "{{ ovn_route_advertisements_enable }}"
+        - name: OVN_ADVERTISED_UDN_ISOLATION_MODE
+          value: "{{ ovn_advertised_udn_isolation_mode }}"
         - name: OVN_HYBRID_OVERLAY_NET_CIDR
           value: "{{ ovn_hybrid_overlay_net_cidr }}"
         - name: OVN_DISABLE_SNAT_MULTIPLE_GWS
diff --git a/docs/features/cluster-egress-controls/egress-ip.md b/docs/features/cluster-egress-controls/egress-ip.md
index d4cbd30b8c..f5c02232f5 100644
--- a/docs/features/cluster-egress-controls/egress-ip.md
+++ b/docs/features/cluster-egress-controls/egress-ip.md
@@ -204,11 +204,11 @@
 Due to the fact that ovn-controllers on different nodes apply the changes independently,
 there is a chance that some pod traffic will reach the egress node before it configures the SNAT rules.
 The following flows are added on breth0 to address this scenario:
 ```shell
-# Commit connections from local pods so they are not affected by the drop rule below, this is required for ICNIv2
-priority=109,ip,in_port=2,nw_src= actions=ct(commit,zone=64000,exec(set_field:0x1->ct_mark)),output:1
+# Output traffic from local pods so they are not affected by the drop rule below; this is required for ICNIv2 and advertised UDNs
+priority=104,ip,in_port=2,nw_src= actions=output:eth0
 
 # Drop non SNATed egress traffic coming from non-local pods
-priority=104,ip,in_port=2,nw_src= actions=drop
+priority=103,ip,in_port=2,nw_src= actions=drop
 
 # Commit connections coming from IPs not in cluster network
 priority=100,ip,in_port=2 actions=ct(commit,zone=64000,exec(set_field:0x1->ct_mark)),output:1
diff --git a/go-controller/pkg/clustermanager/routeadvertisements/controller.go b/go-controller/pkg/clustermanager/routeadvertisements/controller.go
index 18fb3dbaae..774a5341f3 100644
--- a/go-controller/pkg/clustermanager/routeadvertisements/controller.go
+++ b/go-controller/pkg/clustermanager/routeadvertisements/controller.go
@@ -593,14 +593,13 @@ func (c *Controller) generateFRRConfiguration(
 	matchedNetworks sets.Set[string],
 ) (*frrtypes.FRRConfiguration, error) {
 	routers := []frrtypes.Router{}
-	advertisements := sets.New(ra.Spec.Advertisements...)
 
 	// go over the source routers
 	for i, router := range source.Spec.BGP.Routers {
 		targetVRF := ra.Spec.TargetVRF
 
 		var matchedVRF, matchedNetwork string
-		var receivePrefixes, advertisePrefixes []string
+		var advertisePrefixes []string
 
 		// We will use the router if:
 		// - the router VRF matches the target VRF
@@ -608,33 +607,25 @@ func (c *Controller) generateFRRConfiguration(
 		// Prepare each scenario with a switch statement and check after that
 		switch {
 		case targetVRF == "auto" && router.VRF == "":
-			// match on default network/VRF, advertise node prefixes and receive
-			// any prefix of default network.
+			// match on default network/VRF, advertise node prefixes
 			matchedVRF = ""
 			matchedNetwork = types.DefaultNetworkName
 			advertisePrefixes = selectedNetworks.hostNetworkSubnets[matchedNetwork]
-			receivePrefixes = selectedNetworks.networkSubnets[matchedNetwork]
 		case targetVRF == "auto":
-			// match router.VRF to network.VRF, advertise node prefixes and
-			// receive any prefix of the matched network
+			// match router.VRF to network.VRF, advertise node prefixes
 			matchedVRF = router.VRF
 			matchedNetwork = selectedNetworks.networkVRFs[matchedVRF]
 			advertisePrefixes = selectedNetworks.hostNetworkSubnets[matchedNetwork]
-			receivePrefixes = selectedNetworks.networkSubnets[matchedNetwork]
 		case targetVRF == "":
-			// match on default network/VRF, advertise node prefixes and
-			// receive any prefix of selected networks
+			// match on default network/VRF, advertise node prefixes
 			matchedVRF = ""
 			matchedNetwork = types.DefaultNetworkName
 			advertisePrefixes = selectedNetworks.hostSubnets
-			receivePrefixes = selectedNetworks.subnets
 		default:
-			// match router.VRF to network.VRF, advertise node prefixes and
-			// receive any prefix of selected networks
+			// match router.VRF to network.VRF, advertise node prefixes
 			matchedVRF = targetVRF
 			matchedNetwork = selectedNetworks.networkVRFs[matchedVRF]
 			advertisePrefixes = selectedNetworks.hostSubnets
-			receivePrefixes = selectedNetworks.subnets
 		}
 		if matchedVRF != router.VRF || len(advertisePrefixes) == 0 {
 			// either this router VRF does not match the target VRF or we don't
@@ -669,7 +660,6 @@ func (c *Controller) generateFRRConfiguration(
 			isIPV6 := utilnet.IsIPv6String(neighbor.Address)
 			advertisePrefixes := util.MatchAllIPNetsStringFamily(isIPV6, advertisePrefixes)
-			receivePrefixes := util.MatchAllIPNetsStringFamily(isIPV6, receivePrefixes)
 			if len(advertisePrefixes) == 0 {
 				continue
 			}
@@ -680,22 +670,6 @@ func (c *Controller) generateFRRConfiguration(
 					Prefixes: advertisePrefixes,
 				},
 			}
-			neighbor.ToReceive = frrtypes.Receive{
-				Allowed: frrtypes.AllowedInPrefixes{
-					Mode: frrtypes.AllowRestricted,
-				},
-			}
-			if advertisements.Has(ratypes.PodNetwork) {
-				for _, prefix := range receivePrefixes {
-					neighbor.ToReceive.Allowed.Prefixes = append(neighbor.ToReceive.Allowed.Prefixes,
-						frrtypes.PrefixSelector{
-							Prefix: prefix,
-							LE:     selectedNetworks.prefixLength[prefix],
-							GE:     selectedNetworks.prefixLength[prefix],
-						},
-					)
-				}
-			}
 			targetRouter.Neighbors = append(targetRouter.Neighbors, neighbor)
 		}
 		if len(targetRouter.Neighbors) == 0 {
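With the ToReceive section dropped, the generated FRRConfiguration no longer restricts per-network receive prefixes; only ToAdvertise stays restricted. What the speaker actually accepts from a neighbor can be checked on the FRR side instead (namespace and daemonset names follow a stock frr-k8s install, and the neighbor address is illustrative, taken from the tests below):

```shell
kubectl -n frr-k8s-system exec ds/frr-k8s -c frr -- \
  vtysh -c 'show bgp ipv4 unicast neighbors 1.0.0.100 routes'
```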
diff --git a/go-controller/pkg/clustermanager/routeadvertisements/controller_test.go b/go-controller/pkg/clustermanager/routeadvertisements/controller_test.go
index 305418425c..cf8d58729f 100644
--- a/go-controller/pkg/clustermanager/routeadvertisements/controller_test.go
+++ b/go-controller/pkg/clustermanager/routeadvertisements/controller_test.go
@@ -152,7 +152,6 @@ type testNeighbor struct {
 	ASN       uint32
 	Address   string
 	DisableMP *bool
-	Receive   []string
 	Advertise []string
 }
 
@@ -161,11 +160,6 @@ func (tn testNeighbor) Neighbor() frrapi.Neighbor {
 		ASN:       tn.ASN,
 		Address:   tn.Address,
 		DisableMP: true,
-		ToReceive: frrapi.Receive{
-			Allowed: frrapi.AllowedInPrefixes{
-				Mode: frrapi.AllowRestricted,
-			},
-		},
 		ToAdvertise: frrapi.Advertise{
 			Allowed: frrapi.AllowedOutPrefixes{
 				Mode: frrapi.AllowRestricted,
@@ -176,31 +170,6 @@ func (tn testNeighbor) Neighbor() frrapi.Neighbor {
 	if tn.DisableMP != nil {
 		n.DisableMP = *tn.DisableMP
 	}
-	for _, receive := range tn.Receive {
-		sep := strings.LastIndex(receive, "/")
-		if sep == -1 {
-			continue
-		}
-		if isLayer2 := strings.Count(receive, "/") == 1; isLayer2 {
-			n.ToReceive.Allowed.Prefixes = append(n.ToReceive.Allowed.Prefixes,
-				frrapi.PrefixSelector{
-					Prefix: receive,
-				},
-			)
-			continue
-		}
-
-		first := receive[:sep]
-		last := receive[sep+1:]
-		len := ovntest.MustAtoi(last)
-		n.ToReceive.Allowed.Prefixes = append(n.ToReceive.Allowed.Prefixes,
-			frrapi.PrefixSelector{
-				Prefix: first,
-				GE:     uint32(len),
-				LE:     uint32(len),
-			},
-		)
-	}
 	return n
 }
 
@@ -433,7 +402,7 @@ func TestController_reconcile(t *testing.T) {
 				NodeSelector: map[string]string{"kubernetes.io/hostname": "node"},
 				Routers: []*testRouter{
 					{ASN: 1, Prefixes: []string{"1.0.1.1/32", "1.1.0.0/24"}, Neighbors: []*testNeighbor{
-						{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.0.1.1/32", "1.1.0.0/24"}, Receive: []string{"1.1.0.0/16/24"}},
+						{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.0.1.1/32", "1.1.0.0/24"}},
 					}},
 				}},
 			},
@@ -465,8 +434,8 @@ func TestController_reconcile(t *testing.T) {
 				NodeSelector: map[string]string{"kubernetes.io/hostname": "node"},
 				Routers: []*testRouter{
 					{ASN: 1, Prefixes: []string{"1.0.1.1/32", "1.1.0.0/24", "fd01::/64", "fd03::ffff:100:101/128"}, Neighbors: []*testNeighbor{
-						{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.0.1.1/32", "1.1.0.0/24"}, Receive: []string{"1.1.0.0/16/24"}},
-						{ASN: 1, Address: "fd02::ffff:100:64", Advertise: []string{"fd01::/64", "fd03::ffff:100:101/128"}, Receive: []string{"fd01::/48/64"}},
+						{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.0.1.1/32", "1.1.0.0/24"}},
+						{ASN: 1, Address: "fd02::ffff:100:64", Advertise: []string{"fd01::/64", "fd03::ffff:100:101/128"}},
 					}},
 				}},
 			},
@@ -503,7 +472,7 @@ func TestController_reconcile(t *testing.T) {
 				NodeSelector: map[string]string{"kubernetes.io/hostname": "node"},
 				Routers: []*testRouter{
 					{ASN: 1, Prefixes: []string{"1.2.0.0/24", "1.3.0.0/24", "1.4.0.0/16", "1.5.0.0/16"}, Imports: []string{"black", "blue", "green", "red"}, Neighbors: []*testNeighbor{
-						{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.2.0.0/24", "1.3.0.0/24", "1.4.0.0/16", "1.5.0.0/16"}, Receive: []string{"1.2.0.0/16/24", "1.3.0.0/16/24", "1.4.0.0/16", "1.5.0.0/16"}},
+						{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.2.0.0/24", "1.3.0.0/24", "1.4.0.0/16", "1.5.0.0/16"}},
 					}},
 					{ASN: 1, VRF: "black", Imports: []string{"default"}},
 					{ASN: 1, VRF: "blue", Imports: []string{"default"}},
@@ -636,7 +605,7 @@ func TestController_reconcile(t *testing.T) {
 					NodeSelector: map[string]string{"kubernetes.io/hostname": "node"},
 					Routers: []*testRouter{
 						{ASN: 1, Prefixes: []string{"1.0.1.1/32", "1.1.0.0/24"}, Neighbors: []*testNeighbor{
-							{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.0.1.1/32", "1.1.0.0/24"}, Receive: []string{"1.1.0.0/16/24"}},
+							{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.0.1.1/32", "1.1.0.0/24"}},
 						}},
 					},
 				},
@@ -744,13 +713,13 @@ func TestController_reconcile(t *testing.T) {
 					NodeSelector: map[string]string{"kubernetes.io/hostname": "node1"},
 					Routers: []*testRouter{
 						{ASN: 1, Prefixes: []string{"1.1.1.0/24"}, Neighbors: []*testNeighbor{
-							{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.1.1.0/24"}, Receive: []string{"1.1.0.0/16/24"}},
+							{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.1.1.0/24"}},
 						}},
 						{ASN: 1, VRF: "red", Prefixes: []string{"1.2.1.0/24"}, Neighbors: []*testNeighbor{
-							{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.2.1.0/24"}, Receive: []string{"1.2.0.0/16/24"}},
+							{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.2.1.0/24"}},
 						}},
 						{ASN: 1, VRF: "green", Prefixes: []string{"1.4.0.0/16"}, Neighbors: []*testNeighbor{
-							{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.4.0.0/16"}, Receive: []string{"1.4.0.0/16"}},
+							{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.4.0.0/16"}},
 						}},
 					},
 				},
@@ -760,7 +729,7 @@ func TestController_reconcile(t *testing.T) {
 					NodeSelector: map[string]string{"kubernetes.io/hostname": "node2"},
 					Routers: []*testRouter{
 						{ASN: 1, Prefixes: []string{"1.1.2.0/24"}, Neighbors: []*testNeighbor{
-							{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.1.2.0/24"}, Receive: []string{"1.1.0.0/16/24"}},
+							{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.1.2.0/24"}},
 						}},
 					},
 				},
@@ -770,10 +739,10 @@ func TestController_reconcile(t *testing.T) {
 					NodeSelector: map[string]string{"kubernetes.io/hostname": "node2"},
 					Routers: []*testRouter{
 						{ASN: 1, VRF: "red", Prefixes: []string{"1.2.2.0/24"}, Neighbors: []*testNeighbor{
-							{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.2.2.0/24"}, Receive: []string{"1.2.0.0/16/24"}},
+							{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.2.2.0/24"}},
 						}},
 						{ASN: 1, VRF: "green", Prefixes: []string{"1.4.0.0/16"}, Neighbors: []*testNeighbor{
-							{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.4.0.0/16"}, Receive: []string{"1.4.0.0/16"}},
+							{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.4.0.0/16"}},
 						}},
 					},
 				},
@@ -799,7 +768,7 @@ func TestController_reconcile(t *testing.T) {
 					NodeSelector: map[string]string{"kubernetes.io/hostname": "node"},
 					Routers: []*testRouter{
 						{ASN: 1, Prefixes: []string{"1.0.1.1/32", "1.1.0.0/24"}, Neighbors: []*testNeighbor{
-							{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.0.1.1/32", "1.1.0.0/24"}, Receive: []string{"1.1.0.0/16/24"}},
+							{ASN: 1, Address: "1.0.0.100", Advertise: []string{"1.0.1.1/32", "1.1.0.0/24"}},
 						}},
 					},
 				},
diff --git a/go-controller/pkg/config/config.go b/go-controller/pkg/config/config.go
index 89757b864b..27aa77ad37 100644
--- a/go-controller/pkg/config/config.go
+++ b/go-controller/pkg/config/config.go
@@ -134,6 +134,7 @@ var (
 	// OVNKubernetesFeatureConfig holds OVN-Kubernetes feature enhancement config file parameters and command-line overrides
 	OVNKubernetesFeature = OVNKubernetesFeatureConfig{
 		EgressIPReachabiltyTotalTimeout: 1,
+		AdvertisedUDNIsolationMode:      AdvertisedUDNIsolationModeStrict,
 	}
 
 	// OvnNorth holds northbound OVN database client and server authentication and location details
@@ -434,6 +435,9 @@ type OVNKubernetesFeatureConfig struct {
 	EnableServiceTemplateSupport bool `gcfg:"enable-svc-template-support"`
 	EnableObservability          bool `gcfg:"enable-observability"`
 	EnableNetworkQoS             bool `gcfg:"enable-network-qos"`
+	// This feature requires a kernel fix https://github.com/torvalds/linux/commit/7f3287db654395f9c5ddd246325ff7889f550286
+	// to work on a kind cluster. The flag allows disabling it for the current CI; it will be turned on once GitHub runners have this fix.
+	AdvertisedUDNIsolationMode string `gcfg:"advertised-udn-isolation-mode"`
 }
 
 // GatewayMode holds the node gateway mode
@@ -448,6 +452,13 @@ const (
 	GatewayModeLocal GatewayMode = "local"
 )
 
+const (
+	// AdvertisedUDNIsolationModeStrict enables pod isolation across advertised UDN networks.
+	AdvertisedUDNIsolationModeStrict = "strict"
+	// AdvertisedUDNIsolationModeLoose disables pod isolation across advertised UDN networks.
+	AdvertisedUDNIsolationModeLoose = "loose"
+)
+
 // GatewayConfig holds node gateway-related parsed config file parameters and command-line overrides
 type GatewayConfig struct {
 	// Mode is the gateway mode; if may be either empty (disabled), "shared", or "local"
@@ -498,7 +509,7 @@ type GatewayConfig struct {
 	// EphemeralPortRange is the range of ports used by egress SNAT operations in OVN. Specifically for NAT where
 	// the source IP of the NAT will be a shared Node IP address. If unset, the value will be determined by sysctl lookup
 	// for the kernel's ephemeral range: net.ipv4.ip_local_port_range. Format is "-".
-	EphemeralPortRange string `gfcg:"ephemeral-port-range"`
+	EphemeralPortRange string `gcfg:"ephemeral-port-range"`
 }
 
 // OvnAuthConfig holds client authentication and location details for
@@ -1102,6 +1113,12 @@ var OVNK8sFeatureFlags = []cli.Flag{
 		Destination: &cliConfig.OVNKubernetesFeature.EnableRouteAdvertisements,
 		Value:       OVNKubernetesFeature.EnableRouteAdvertisements,
 	},
+	&cli.StringFlag{
+		Name:        "advertised-udn-isolation-mode",
+		Usage:       "Configure to use pod isolation for BGP advertised UDN networks. Valid values are 'strict' or 'loose'.",
+		Destination: &cliConfig.OVNKubernetesFeature.AdvertisedUDNIsolationMode,
+		Value:       OVNKubernetesFeature.AdvertisedUDNIsolationMode,
+	},
 	&cli.BoolFlag{
 		Name:        "enable-stateless-netpol",
 		Usage:       "Configure to use stateless network policy feature with ovn-kubernetes.",
@@ -2014,6 +2031,10 @@ func buildOVNKubernetesFeatureConfig(cli, file *config) error {
 	if err := overrideFields(&OVNKubernetesFeature, &cli.OVNKubernetesFeature, &savedOVNKubernetesFeature); err != nil {
 		return err
 	}
+	if OVNKubernetesFeature.AdvertisedUDNIsolationMode != AdvertisedUDNIsolationModeStrict && OVNKubernetesFeature.AdvertisedUDNIsolationMode != AdvertisedUDNIsolationModeLoose {
+		return fmt.Errorf("invalid advertised-udn-isolation-mode %q: expect one of %s or %s",
+			OVNKubernetesFeature.AdvertisedUDNIsolationMode, AdvertisedUDNIsolationModeStrict, AdvertisedUDNIsolationModeLoose)
+	}
 	return nil
 }
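The new knob is accepted from either the config file or the CLI and validated once in buildOVNKubernetesFeatureConfig; both forms below mirror the test cases that follow (the config-file path is illustrative):

```shell
# config file
cat >>/etc/openvswitch/ovn_k8s.conf <<'EOF'
[ovnkubernetesfeature]
advertised-udn-isolation-mode=loose
EOF
# or CLI (remaining flags elided)
ovnkube --advertised-udn-isolation-mode=loose
# any other value fails fast:
#   invalid advertised-udn-isolation-mode "foo": expect one of strict or loose
```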
diff --git a/go-controller/pkg/config/config_test.go b/go-controller/pkg/config/config_test.go
index 39e7fa41bc..c5a032c92c 100644
--- a/go-controller/pkg/config/config_test.go
+++ b/go-controller/pkg/config/config_test.go
@@ -229,6 +229,7 @@
 enable-multi-networkpolicy=false
 enable-network-segmentation=false
 enable-preconfigured-udn-addresses=false
 enable-route-advertisements=false
+advertised-udn-isolation-mode=strict
 enable-interconnect=false
 enable-multi-external-gateway=false
 enable-admin-network-policy=false
@@ -346,6 +347,7 @@ var _ = Describe("Config Operations", func() {
 			gomega.Expect(OVNKubernetesFeature.EnableMultiExternalGateway).To(gomega.BeFalse())
 			gomega.Expect(OVNKubernetesFeature.EnableAdminNetworkPolicy).To(gomega.BeFalse())
 			gomega.Expect(OVNKubernetesFeature.EnablePersistentIPs).To(gomega.BeFalse())
+			gomega.Expect(OVNKubernetesFeature.AdvertisedUDNIsolationMode).To(gomega.Equal(AdvertisedUDNIsolationModeStrict))
 
 			for _, a := range []OvnAuthConfig{OvnNorth, OvnSouth} {
 				gomega.Expect(a.Scheme).To(gomega.Equal(OvnDBSchemeUnix))
@@ -601,6 +603,7 @@ var _ = Describe("Config Operations", func() {
 				"enable-network-segmentation=true",
 				"enable-preconfigured-udn-addresses=true",
 				"enable-route-advertisements=true",
+				"advertised-udn-isolation-mode=loose",
 				"enable-interconnect=true",
 				"enable-multi-external-gateway=true",
 				"enable-admin-network-policy=true",
@@ -692,6 +695,7 @@ var _ = Describe("Config Operations", func() {
 			gomega.Expect(OVNKubernetesFeature.EnableNetworkSegmentation).To(gomega.BeTrue())
 			gomega.Expect(OVNKubernetesFeature.EnablePreconfiguredUDNAddresses).To(gomega.BeTrue())
 			gomega.Expect(OVNKubernetesFeature.EnableRouteAdvertisements).To(gomega.BeTrue())
+			gomega.Expect(OVNKubernetesFeature.AdvertisedUDNIsolationMode).To(gomega.Equal(AdvertisedUDNIsolationModeLoose))
 			gomega.Expect(OVNKubernetesFeature.EnableInterconnect).To(gomega.BeTrue())
 			gomega.Expect(OVNKubernetesFeature.EnableMultiExternalGateway).To(gomega.BeTrue())
 			gomega.Expect(OVNKubernetesFeature.EnableAdminNetworkPolicy).To(gomega.BeTrue())
@@ -800,6 +804,7 @@ var _ = Describe("Config Operations", func() {
 			gomega.Expect(OVNKubernetesFeature.EnableNetworkSegmentation).To(gomega.BeTrue())
 			gomega.Expect(OVNKubernetesFeature.EnablePreconfiguredUDNAddresses).To(gomega.BeTrue())
 			gomega.Expect(OVNKubernetesFeature.EnableRouteAdvertisements).To(gomega.BeTrue())
+			gomega.Expect(OVNKubernetesFeature.AdvertisedUDNIsolationMode).To(gomega.Equal(AdvertisedUDNIsolationModeLoose))
 			gomega.Expect(OVNKubernetesFeature.EnableMultiNetworkPolicy).To(gomega.BeTrue())
 			gomega.Expect(OVNKubernetesFeature.EnableInterconnect).To(gomega.BeTrue())
 			gomega.Expect(OVNKubernetesFeature.EnableMultiExternalGateway).To(gomega.BeTrue())
@@ -876,6 +881,7 @@ var _ = Describe("Config Operations", func() {
 				"-enable-network-segmentation=true",
 				"-enable-preconfigured-udn-addresses=true",
 				"-enable-route-advertisements=true",
+				"-advertised-udn-isolation-mode=loose",
 				"-enable-interconnect=true",
 				"-enable-multi-external-gateway=true",
 				"-enable-admin-network-policy=true",
@@ -1593,6 +1599,24 @@ foo=bar
 		gomega.Expect(err).NotTo(gomega.HaveOccurred())
 	})
 
+	It("rejects a config with invalid advertised-udn-isolation-mode", func() {
+		err := os.WriteFile(cfgFile.Name(), []byte(`[ovnkubernetesfeature]
+advertised-udn-isolation-mode=foo
+`), 0o644)
+		gomega.Expect(err).NotTo(gomega.HaveOccurred())
+
+		app.Action = func(ctx *cli.Context) error {
+			_, err := InitConfig(ctx, kexec.New(), nil)
+			gomega.Expect(err).To(gomega.MatchError(
+				gomega.ContainSubstring("invalid advertised-udn-isolation-mode \"foo\": expect one of strict or loose")),
+			)
+			return nil
+		}
+		cliArgs := []string{app.Name, "-config-file=" + cfgFile.Name()}
+		err = app.Run(cliArgs)
+		gomega.Expect(err).NotTo(gomega.HaveOccurred())
+	})
+
 	It("rejects a config with invalid syntax", func() {
 		err := os.WriteFile(cfgFile.Name(), []byte(`[default]
 mtu=1234
diff --git a/go-controller/pkg/controllermanager/controller_manager.go b/go-controller/pkg/controllermanager/controller_manager.go
index 27db274d05..55aaec6831 100644
--- a/go-controller/pkg/controllermanager/controller_manager.go
+++ b/go-controller/pkg/controllermanager/controller_manager.go
@@ -2,6 +2,7 @@ package controllermanager
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"sync"
 	"time"
@@ -195,15 +196,15 @@ func (cm *ControllerManager) CleanupStaleNetworks(validNetworks ...util.NetInfo)
 		}
 	}
 
-	if util.IsRouteAdvertisementsEnabled() {
-		// Remove stale subnets from the advertised networks address set used for isolation
-		// NOTE: network reconciliation will take care of removing the subnets for existing networks that are no longer
-		// advertised.
-		addressSetFactory := addressset.NewOvnAddressSetFactory(cm.nbClient, config.IPv4Mode, config.IPv6Mode)
-		advertisedSubnets, err := addressSetFactory.GetAddressSet(ovn.GetAdvertisedNetworkSubnetsAddressSetDBIDs())
-		if err != nil {
-			return fmt.Errorf("failed to get advertised subnets addresset %s: %w", ovn.GetAdvertisedNetworkSubnetsAddressSetDBIDs(), err)
-		}
+	// Remove stale subnets from the advertised networks address set used for isolation
+	// NOTE: network reconciliation will take care of removing the subnets for existing networks that are no longer
+	// advertised.
+	addressSetFactory := addressset.NewOvnAddressSetFactory(cm.nbClient, config.IPv4Mode, config.IPv6Mode)
+	advertisedSubnets, err := addressSetFactory.GetAddressSet(ovn.GetAdvertisedNetworkSubnetsAddressSetDBIDs())
+	if err != nil && !errors.Is(err, libovsdbclient.ErrNotFound) {
+		return fmt.Errorf("failed to get advertised subnets address set %s: %w", ovn.GetAdvertisedNetworkSubnetsAddressSetDBIDs(), err)
+	}
+	if advertisedSubnets != nil {
 		v4AdvertisedSubnets, v6AdvertisedSubnets := advertisedSubnets.GetAddresses()
 		var invalidSubnets []string
 		for _, subnet := range append(v4AdvertisedSubnets, v6AdvertisedSubnets...) {
diff --git a/go-controller/pkg/libovsdb/ops/router.go b/go-controller/pkg/libovsdb/ops/router.go
index 18b3931a1f..5f0ce594d4 100644
--- a/go-controller/pkg/libovsdb/ops/router.go
+++ b/go-controller/pkg/libovsdb/ops/router.go
@@ -932,6 +932,11 @@ func RemoveLoadBalancersFromLogicalRouterOps(nbClient libovsdbclient.Client, ops
 	return ops, err
 }
 
+func getNATMutableFields(nat *nbdb.NAT) []interface{} {
+	return []interface{}{&nat.Type, &nat.ExternalIP, &nat.LogicalIP, &nat.LogicalPort, &nat.ExternalMAC,
+		&nat.ExternalIDs, &nat.Match, &nat.Options, &nat.ExternalPortRange, &nat.GatewayPort, &nat.Priority}
+}
+
 func buildNAT(
 	natType nbdb.NATType,
 	externalIP string,
@@ -1035,7 +1040,7 @@ func BuildDNATAndSNATWithMatch(
 // isEquivalentNAT checks if the `searched` NAT is equivalent to `existing`.
 // Returns true if the UUID is set in `searched` and matches the UUID of `existing`.
 // Otherwise, perform the following checks:
-// - Compare the Type and Match fields.
+// - Compare the Type.
 // - Compare ExternalIP if it is set in `searched`.
 // - Compare LogicalIP if the Type in `searched` is SNAT.
 // - Compare LogicalPort if it is set in `searched`.
@@ -1050,10 +1055,6 @@ func isEquivalentNAT(existing *nbdb.NAT, searched *nbdb.NAT) bool {
 		return false
 	}
 
-	if searched.Match != existing.Match {
-		return false
-	}
-
 	// Compare externalIP if it's not empty.
 	if searched.ExternalIP != "" && searched.ExternalIP != existing.ExternalIP {
 		return false
@@ -1156,7 +1157,7 @@ func CreateOrUpdateNATsOps(nbClient libovsdbclient.Client, ops []ovsdb.Operation
 		}
 		opModel := operationModel{
 			Model:          inputNat,
-			OnModelUpdates: onModelUpdatesAllNonDefault(),
+			OnModelUpdates: getNATMutableFields(inputNat),
 			ErrNotFound:    false,
 			BulkOp:         false,
 			DoAfter:        func() { router.Nat = append(router.Nat, inputNat.UUID) },
@@ -1284,7 +1285,7 @@ func UpdateNATOps(nbClient libovsdbclient.Client, ops []ovsdb.Operation, nats ...
 	opModel := []operationModel{
 		{
 			Model:          nat,
-			OnModelUpdates: onModelUpdatesAllNonDefault(),
+			OnModelUpdates: getNATMutableFields(nat),
 			ErrNotFound:    true,
 			BulkOp:         false,
 		},
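Because Match no longer takes part in NAT equivalence but is still in the mutable-field list, a NAT whose match string changed is updated in place rather than left as a stale duplicate. A hedged way to observe this against a live northbound DB (router name and values are illustrative):

```shell
# the SNAT for a given logical_ip should stay a single row whose
# match column tracks the latest isolation state:
ovn-nbctl --columns=_uuid,type,logical_ip,match find NAT type=snat
ovn-nbctl lr-nat-list GR_ovn-worker
```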
This handles the case where a pod on the UDN subnet is sending traffic to @@ -378,7 +382,7 @@ func (b *BridgeConfiguration) flowsForDefaultBridge(extraIPs []net.IP) ([]string dftFlows = append(dftFlows, fmt.Sprintf("cookie=%s, priority=200, table=2, ip, ip_src=%s, "+ "actions=drop", - nodetypes.DefaultOpenFlowCookie, srcIPOrSubnet)) + nodetypes.DefaultOpenFlowCookie, netConfig.V4MasqIPs.ManagementPort.IP.String())) dftFlows = append(dftFlows, fmt.Sprintf("cookie=%s, priority=250, table=2, ip, pkt_mark=%s, "+ @@ -393,7 +397,6 @@ func (b *BridgeConfiguration) flowsForDefaultBridge(extraIPs []net.IP) ([]string if netConfig.IsDefaultNetwork() { continue } - srcIPOrSubnet := netConfig.V6MasqIPs.ManagementPort.IP.String() if util.IsRouteAdvertisementsEnabled() && netConfig.Advertised.Load() { var udnAdvertisedSubnets []*net.IPNet for _, clusterEntry := range netConfig.Subnets { @@ -407,13 +410,15 @@ func (b *BridgeConfiguration) flowsForDefaultBridge(extraIPs []net.IP) ([]string continue } - // Use the filtered subnets for the flow compute instead of the masqueradeIP - srcIPOrSubnet = matchingIPFamilySubnet.String() + dftFlows = append(dftFlows, + fmt.Sprintf("cookie=%s, priority=200, table=2, ip6, ipv6_src=%s, "+ + "actions=drop", + nodetypes.DefaultOpenFlowCookie, matchingIPFamilySubnet.String())) } dftFlows = append(dftFlows, fmt.Sprintf("cookie=%s, priority=200, table=2, ip6, ipv6_src=%s, "+ "actions=drop", - nodetypes.DefaultOpenFlowCookie, srcIPOrSubnet)) + nodetypes.DefaultOpenFlowCookie, netConfig.V6MasqIPs.ManagementPort.IP.String())) dftFlows = append(dftFlows, fmt.Sprintf("cookie=%s, priority=250, table=2, ip6, pkt_mark=%s, "+ "actions=set_field:%s->eth_dst,output:%s", @@ -732,70 +737,94 @@ func (b *BridgeConfiguration) commonFlows(hostSubnets []*net.IPNet) ([]string, e "actions=ct(zone=%d, nat, table=1)", nodetypes.DefaultOpenFlowCookie, ofPortPhys, config.Default.ConntrackZone)) } } - // Egress IP is often configured on a node different from the one hosting the affected pod. - // Due to the fact that ovn-controllers on different nodes apply the changes independently, - // there is a chance that the pod traffic will reach the egress node before it configures the SNAT flows. - // Drop pod traffic that is not SNATed, excluding local pods(required for ICNIv2) - defaultNetConfig := b.netConfig[types.DefaultNetworkName] - if config.OVNKubernetesFeature.EnableEgressIP { - for _, clusterEntry := range config.Default.ClusterSubnets { - cidr := clusterEntry.CIDR - ipv := getIPv(cidr) - // table 0, drop packets coming from pods headed externally that were not SNATed. - dftFlows = append(dftFlows, - fmt.Sprintf("cookie=%s, priority=104, in_port=%s, %s, %s_src=%s, actions=drop", - nodetypes.DefaultOpenFlowCookie, defaultNetConfig.OfPortPatch, ipv, ipv, cidr)) - } - for _, subnet := range defaultNetConfig.NodeSubnets { - ipv := getIPv(subnet) - if ofPortPhys != "" { - // table 0, commit connections from local pods. - // ICNIv2 requires that local pod traffic can leave the node without SNAT. 
- dftFlows = append(dftFlows, - fmt.Sprintf("cookie=%s, priority=109, in_port=%s, dl_src=%s, %s, %s_src=%s"+ - "actions=ct(commit, zone=%d, exec(set_field:%s->ct_mark)), output:%s", - nodetypes.DefaultOpenFlowCookie, defaultNetConfig.OfPortPatch, bridgeMacAddress, ipv, ipv, subnet, - config.Default.ConntrackZone, nodetypes.CtMarkOVN, ofPortPhys)) - } - } - } - if ofPortPhys != "" { + defaultNetConfig := b.netConfig[types.DefaultNetworkName] + // table 0, Ingress/Egress flows for MEG enabled pods and advertised UDNs + // priority 300: Ingress traffic to MEG pods and advertised UDNs + // priority 301: Ingress traffic to node management traffic + // priority 104: Egress traffic from advertised UDNs or MEG enabled pods + // priority 103: For egressIP, drop packets coming from pods from other nodes headed externally that were not SNATed. + // example flows in SGW mode EIP enabled: + // table=0, n_packets=0, n_bytes=0, priority=300,ip,in_port=eth0,nw_dst= actions=output:4 + // table=0, n_packets=0, n_bytes=0, priority=301,ip,in_port=eth0,nw_dst= actions=output:LOCAL + // table=0, n_packets=0, n_bytes=0, priority=104,ip,in_port=4,dl_src=02:42:ac:12:00:03,nw_src= actions=output:eth0 + // table=0, n_packets=0, n_bytes=0, priority=103,ip,in_port=4,nw_src= actions=drop + // example flows in LGW mode EIP enabled: + // table=0, n_packets=0, n_bytes=0, priority=300,ip,in_port=eth0,nw_dst= actions=output:LOCAL + // table=0, n_packets=0, n_bytes=0, priority=104,ip,in_port=LOCAL,dl_src=02:42:ac:12:00:03,nw_src= actions=output:eth0 + // table=0, n_packets=0, n_bytes=0, priority=103,ip,in_port=4,nw_src= actions=drop + // example flows in SGW mode EIP disabled: + // table=0, n_packets=0, n_bytes=0, priority=300,ip,in_port=eth0,nw_dst= actions=output:4 + // table=0, n_packets=0, n_bytes=0, priority=301,ip,in_port=eth0,nw_dst= actions=output:LOCAL + // table=0, n_packets=0, n_bytes=0, priority=104,ip,in_port=4,dl_src=02:42:ac:12:00:03,nw_src= actions=output:eth0 + // example flows in LGW mode EIP disabled: + // table=0, n_packets=0, n_bytes=0, priority=300,ip,in_port=eth0,nw_dst= actions=output:LOCAL + // table=0, n_packets=0, n_bytes=0, priority=104,ip,in_port=LOCAL,dl_src=02:42:ac:12:00:03,nw_src= actions=output:eth0 for _, netConfig := range b.patchedNetConfigs() { isNetworkAdvertised := netConfig.Advertised.Load() // disableSNATMultipleGWs only applies to default network disableSNATMultipleGWs := netConfig.IsDefaultNetwork() && config.Gateway.DisableSNATMultipleGWs + + if config.OVNKubernetesFeature.EnableEgressIP { + // Due to the fact that ovn-controllers on different nodes apply the changes independently, + // there is a chance that the pod traffic will reach the egress node before it configures the SNAT flows. + // Drop pod traffic that is not SNATed + for _, clusterEntry := range netConfig.Subnets { + cidr := clusterEntry.CIDR + ipv := getIPv(cidr) + // table 0, drop packets coming from pods headed externally that were not SNATed. 
+ dftFlows = append(dftFlows, + fmt.Sprintf("cookie=%s, priority=103, in_port=%s, %s, %s_src=%s, actions=drop", + nodetypes.DefaultOpenFlowCookie, netConfig.OfPortPatch, ipv, ipv, cidr)) + } + } + // skip if MEG is disabled for the default network + // and the network (default or UDN) is not advertised if !disableSNATMultipleGWs && !isNetworkAdvertised { continue } output := netConfig.OfPortPatch - if isNetworkAdvertised && config.Gateway.Mode == config.GatewayModeLocal { + input := netConfig.OfPortPatch + isAdvertisedLGW := isNetworkAdvertised && config.Gateway.Mode == config.GatewayModeLocal + if isAdvertisedLGW { // except if advertised through BGP, go to kernel // TODO: MEG enabled pods should still go through the patch port // but holding this until // https://issues.redhat.com/browse/FDP-646 is fixed, for now we // are assuming MEG & BGP are not used together output = nodetypes.OvsLocalPort + input = nodetypes.OvsLocalPort } - for _, clusterEntry := range netConfig.Subnets { - cidr := clusterEntry.CIDR - ipv := getIPv(cidr) + for _, subnet := range netConfig.NodeSubnets { + ipv := getIPv(subnet) dftFlows = append(dftFlows, - fmt.Sprintf("cookie=%s, priority=15, table=1, %s, %s_dst=%s, "+ + fmt.Sprintf("cookie=%s, priority=300, table=0, in_port=%s, %s, %s_dst=%s, "+ "actions=output:%s", - nodetypes.DefaultOpenFlowCookie, ipv, ipv, cidr, output)) - } - if output == netConfig.OfPortPatch { + nodetypes.DefaultOpenFlowCookie, ofPortPhys, ipv, ipv, subnet, output)) // except node management traffic - for _, subnet := range netConfig.NodeSubnets { - mgmtIP := util.GetNodeManagementIfAddr(subnet) - ipv := getIPv(mgmtIP) + mgmtIP := util.GetNodeManagementIfAddr(subnet) + if mgmtIP == nil { + return nil, fmt.Errorf("unable to determine management IP for subnet %s", subnet.String()) + } + if config.Gateway.Mode != config.GatewayModeLocal { dftFlows = append(dftFlows, - fmt.Sprintf("cookie=%s, priority=16, table=1, %s, %s_dst=%s, "+ + fmt.Sprintf("cookie=%s, priority=301, table=0, in_port=%s, %s, %s_dst=%s, "+ "actions=output:%s", - nodetypes.DefaultOpenFlowCookie, ipv, ipv, mgmtIP.IP, nodetypes.OvsLocalPort), + nodetypes.DefaultOpenFlowCookie, ofPortPhys, ipv, ipv, mgmtIP.IP, nodetypes.OvsLocalPort), ) } + + if (disableSNATMultipleGWs && config.OVNKubernetesFeature.EnableEgressIP) || isNetworkAdvertised { + // MEG and advertised UDN networks requires that local pod traffic can leave the node without SNAT. + // We match on the pod subnets and forward the traffic to the physical interface. + // Select priority 104 for the scenario when both EgressIP and advertised UDN are active: + // 1. Override egressIP drop flows (priority 103) + // 2. 
Still allow egressIP flows at priority 105 + dftFlows = append(dftFlows, + fmt.Sprintf("cookie=%s, priority=104, in_port=%s, dl_src=%s, %s, %s_src=%s, "+ + "actions=output:%s", + nodetypes.DefaultOpenFlowCookie, input, bridgeMacAddress, ipv, ipv, subnet, ofPortPhys)) + } } } diff --git a/go-controller/pkg/node/default_node_network_controller.go b/go-controller/pkg/node/default_node_network_controller.go index f1281980a8..47ba8f6262 100644 --- a/go-controller/pkg/node/default_node_network_controller.go +++ b/go-controller/pkg/node/default_node_network_controller.go @@ -188,7 +188,7 @@ func NewDefaultNodeNetworkController(cnnci *CommonNodeNetworkControllerInfo, net nc.initRetryFrameworkForNode() - err = setupPMTUDNFTSets() + err = setupRemoteNodeNFTSets() if err != nil { return nil, fmt.Errorf("failed to setup PMTUD nftables sets: %w", err) } @@ -1515,25 +1515,34 @@ func (nc *DefaultNodeNetworkController) WatchNodes() error { func (nc *DefaultNodeNetworkController) addOrUpdateNode(node *corev1.Node) error { var nftElems []*knftables.Element var addrs []string - for _, address := range node.Status.Addresses { - if address.Type != corev1.NodeInternalIP { - continue - } - nodeIP := net.ParseIP(address.Address) - if nodeIP == nil { - continue - } + // Use GetNodeAddresses to get all node IPs (including the current node for openflow) + ipsv4, ipsv6, err := util.GetNodeAddresses(config.IPv4Mode, config.IPv6Mode, node) + if err != nil { + return fmt.Errorf("failed to get node addresses for node %q: %w", node.Name, err) + } + + // Process IPv4 addresses + for _, nodeIP := range ipsv4 { addrs = append(addrs, nodeIP.String()) klog.Infof("Adding remote node %q, IP: %s to PMTUD blocking rules", node.Name, nodeIP) - if utilnet.IsIPv4(nodeIP) { + // Only add to nftables if this is a remote node + if node.Name != nc.name { nftElems = append(nftElems, &knftables.Element{ - Set: types.NFTNoPMTUDRemoteNodeIPsv4, + Set: types.NFTRemoteNodeIPsv4, Key: []string{nodeIP.String()}, }) - } else { + } + } + + // Process IPv6 addresses + for _, nodeIP := range ipsv6 { + addrs = append(addrs, nodeIP.String()) + klog.Infof("Adding remote node %q, IP: %s to PMTUD blocking rules", node.Name, nodeIP) + // Only add to nftables if this is a remote node + if node.Name != nc.name { nftElems = append(nftElems, &knftables.Element{ - Set: types.NFTNoPMTUDRemoteNodeIPsv6, + Set: types.NFTRemoteNodeIPsv6, Key: []string{nodeIP.String()}, }) } @@ -1557,12 +1566,12 @@ func removePMTUDNodeNFTRules(nodeIPs []net.IP) error { // Remove IPs from NFT sets if utilnet.IsIPv4(nodeIP) { nftElems = append(nftElems, &knftables.Element{ - Set: types.NFTNoPMTUDRemoteNodeIPsv4, + Set: types.NFTRemoteNodeIPsv4, Key: []string{nodeIP.String()}, }) } else { nftElems = append(nftElems, &knftables.Element{ - Set: types.NFTNoPMTUDRemoteNodeIPsv6, + Set: types.NFTRemoteNodeIPsv6, Key: []string{nodeIP.String()}, }) } @@ -1578,18 +1587,18 @@ func removePMTUDNodeNFTRules(nodeIPs []net.IP) error { func (nc *DefaultNodeNetworkController) deleteNode(node *corev1.Node) { gw := nc.Gateway.(*gateway) gw.openflowManager.deleteFlowsByKey(getPMTUDKey(node.Name)) - ipsToRemove := make([]net.IP, 0) - for _, address := range node.Status.Addresses { - if address.Type != corev1.NodeInternalIP { - continue - } - nodeIP := net.ParseIP(address.Address) - if nodeIP == nil { - continue - } - ipsToRemove = append(ipsToRemove, nodeIP) + + // Use GetNodeAddresses to get node IPs + ipsv4, ipsv6, err := util.GetNodeAddresses(config.IPv4Mode, config.IPv6Mode, node) + if err != nil { +
klog.Errorf("Failed to get node addresses for node %q: %v", node.Name, err) + return } + ipsToRemove := make([]net.IP, 0, len(ipsv4)+len(ipsv6)) + ipsToRemove = append(ipsToRemove, ipsv4...) + ipsToRemove = append(ipsToRemove, ipsv6...) + klog.Infof("Deleting NFT elements for node: %s", node.Name) if err := removePMTUDNodeNFTRules(ipsToRemove); err != nil { klog.Errorf("Failed to delete nftables rules for PMTUD blocking for node %q: %v", node.Name, err) @@ -1610,33 +1619,34 @@ func (nc *DefaultNodeNetworkController) syncNodes(objs []interface{}) error { if node.Name == nc.name { continue } - for _, address := range node.Status.Addresses { - if address.Type != corev1.NodeInternalIP { - continue - } - nodeIP := net.ParseIP(address.Address) - if nodeIP == nil { - continue - } - // Remove IPs from NFT sets - if utilnet.IsIPv4(nodeIP) { - keepNFTSetElemsV4 = append(keepNFTSetElemsV4, &knftables.Element{ - Set: types.NFTNoPMTUDRemoteNodeIPsv4, - Key: []string{nodeIP.String()}, - }) - } else { - keepNFTSetElemsV6 = append(keepNFTSetElemsV6, &knftables.Element{ - Set: types.NFTNoPMTUDRemoteNodeIPsv6, - Key: []string{nodeIP.String()}, - }) - } + // Use GetNodeAddresses to get node IPs + ipsv4, ipsv6, err := util.GetNodeAddresses(config.IPv4Mode, config.IPv6Mode, node) + if err != nil { + klog.Errorf("Failed to get node addresses for node %q: %v", node.Name, err) + continue + } + + // Process IPv4 addresses + for _, nodeIP := range ipsv4 { + keepNFTSetElemsV4 = append(keepNFTSetElemsV4, &knftables.Element{ + Set: types.NFTRemoteNodeIPsv4, + Key: []string{nodeIP.String()}, + }) + } + + // Process IPv6 addresses + for _, nodeIP := range ipsv6 { + keepNFTSetElemsV6 = append(keepNFTSetElemsV6, &knftables.Element{ + Set: types.NFTRemoteNodeIPsv6, + Key: []string{nodeIP.String()}, + }) } } - if err := recreateNFTSet(types.NFTNoPMTUDRemoteNodeIPsv4, keepNFTSetElemsV4); err != nil { + if err := recreateNFTSet(types.NFTRemoteNodeIPsv4, keepNFTSetElemsV4); err != nil { errors = append(errors, err) } - if err := recreateNFTSet(types.NFTNoPMTUDRemoteNodeIPsv6, keepNFTSetElemsV6); err != nil { + if err := recreateNFTSet(types.NFTRemoteNodeIPsv6, keepNFTSetElemsV6); err != nil { errors = append(errors, err) } diff --git a/go-controller/pkg/node/default_node_network_controller_test.go b/go-controller/pkg/node/default_node_network_controller_test.go index a1413a7dd1..366ee881d6 100644 --- a/go-controller/pkg/node/default_node_network_controller_test.go +++ b/go-controller/pkg/node/default_node_network_controller_test.go @@ -38,18 +38,18 @@ import ( const v4PMTUDNFTRules = ` add table inet ovn-kubernetes -add rule inet ovn-kubernetes no-pmtud ip daddr @no-pmtud-remote-node-ips-v4 meta l4proto icmp icmp type 3 icmp code 4 counter drop +add rule inet ovn-kubernetes no-pmtud ip daddr @remote-node-ips-v4 meta l4proto icmp icmp type 3 icmp code 4 counter drop add chain inet ovn-kubernetes no-pmtud { type filter hook output priority 0 ; comment "Block egress needs frag/packet too big to remote k8s nodes" ; } -add set inet ovn-kubernetes no-pmtud-remote-node-ips-v4 { type ipv4_addr ; comment "Block egress ICMP needs frag to remote Kubernetes nodes" ; } -add set inet ovn-kubernetes no-pmtud-remote-node-ips-v6 { type ipv6_addr ; comment "Block egress ICMPv6 packet too big to remote Kubernetes nodes" ; } +add set inet ovn-kubernetes remote-node-ips-v4 { type ipv4_addr ; comment "Block egress ICMP needs frag to remote Kubernetes nodes" ; } +add set inet ovn-kubernetes remote-node-ips-v6 { type ipv6_addr ; comment "Block egress 
ICMPv6 packet too big to remote Kubernetes nodes" ; } ` const v6PMTUDNFTRules = ` add table inet ovn-kubernetes -add rule inet ovn-kubernetes no-pmtud meta l4proto icmpv6 icmpv6 type 2 icmpv6 code 0 ip6 daddr @no-pmtud-remote-node-ips-v6 counter drop +add rule inet ovn-kubernetes no-pmtud meta l4proto icmpv6 icmpv6 type 2 icmpv6 code 0 ip6 daddr @remote-node-ips-v6 counter drop add chain inet ovn-kubernetes no-pmtud { type filter hook output priority 0 ; comment "Block egress needs frag/packet too big to remote k8s nodes" ; } -add set inet ovn-kubernetes no-pmtud-remote-node-ips-v4 { type ipv4_addr ; comment "Block egress ICMP needs frag to remote Kubernetes nodes" ; } -add set inet ovn-kubernetes no-pmtud-remote-node-ips-v6 { type ipv6_addr ; comment "Block egress ICMPv6 packet too big to remote Kubernetes nodes" ; } +add set inet ovn-kubernetes remote-node-ips-v4 { type ipv4_addr ; comment "Block egress ICMP needs frag to remote Kubernetes nodes" ; } +add set inet ovn-kubernetes remote-node-ips-v6 { type ipv6_addr ; comment "Block egress ICMPv6 packet too big to remote Kubernetes nodes" ; } ` var _ = Describe("Node", func() { @@ -755,6 +755,9 @@ var _ = Describe("Node", func() { node := corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: nodeName, + Annotations: map[string]string{ + util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", nodeIP+"/24"), + }, }, Status: corev1.NodeStatus{ Addresses: []corev1.NodeAddress{ @@ -769,6 +772,9 @@ var _ = Describe("Node", func() { otherNode := corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: remoteNodeName, + Annotations: map[string]string{ + util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", otherNodeIP+"/24"), + }, }, Status: corev1.NodeStatus{ Addresses: []corev1.NodeAddress{ @@ -806,7 +812,7 @@ var _ = Describe("Node", func() { cnnci := NewCommonNodeNetworkControllerInfo(kubeFakeClient, fakeClient.AdminPolicyRouteClient, wf, nil, nodeName, routeManager) nc = newDefaultNodeNetworkController(cnnci, stop, wg, routeManager, nil) nc.initRetryFrameworkForNode() - err = setupPMTUDNFTSets() + err = setupRemoteNodeNFTSets() Expect(err).NotTo(HaveOccurred()) err = setupPMTUDNFTChain() Expect(err).NotTo(HaveOccurred()) @@ -830,7 +836,7 @@ var _ = Describe("Node", func() { err = nc.WatchNodes() Expect(err).NotTo(HaveOccurred()) nftRules := v4PMTUDNFTRules + ` -add element inet ovn-kubernetes no-pmtud-remote-node-ips-v4 { 169.254.254.61 } +add element inet ovn-kubernetes remote-node-ips-v4 { 169.254.254.61 } ` err = nodenft.MatchNFTRules(nftRules, nft.Dump()) Expect(err).NotTo(HaveOccurred()) @@ -860,6 +866,9 @@ add element inet ovn-kubernetes no-pmtud-remote-node-ips-v4 { 169.254.254.61 } node := corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: nodeName, + Annotations: map[string]string{ + util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", nodeIP+"/24"), + }, }, Status: corev1.NodeStatus{ Addresses: []corev1.NodeAddress{ @@ -874,6 +883,9 @@ add element inet ovn-kubernetes no-pmtud-remote-node-ips-v4 { 169.254.254.61 } otherNode := corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: remoteNodeName, + Annotations: map[string]string{ + util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", otherSubnetNodeIP+"/24"), + }, }, Status: corev1.NodeStatus{ Addresses: []corev1.NodeAddress{ @@ -911,7 +923,7 @@ add element inet ovn-kubernetes no-pmtud-remote-node-ips-v4 { 169.254.254.61 } cnnci := NewCommonNodeNetworkControllerInfo(kubeFakeClient, fakeClient.AdminPolicyRouteClient, wf, nil, nodeName, routeManager) nc = newDefaultNodeNetworkController(cnnci, stop, wg, routeManager, nil) 
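// Note: util.GetNodeAddresses (now used by addOrUpdateNode above) derives node IPs from the host-cidrs annotation, which is why the test nodes here carry util.OVNNodeHostCIDRs entries alongside their Status.Addresses.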
nc.initRetryFrameworkForNode() - err = setupPMTUDNFTSets() + err = setupRemoteNodeNFTSets() Expect(err).NotTo(HaveOccurred()) err = setupPMTUDNFTChain() Expect(err).NotTo(HaveOccurred()) @@ -935,7 +947,7 @@ add element inet ovn-kubernetes no-pmtud-remote-node-ips-v4 { 169.254.254.61 } err = nc.WatchNodes() Expect(err).NotTo(HaveOccurred()) nftRules := v4PMTUDNFTRules + ` -add element inet ovn-kubernetes no-pmtud-remote-node-ips-v4 { 169.254.253.61 } +add element inet ovn-kubernetes remote-node-ips-v4 { 169.254.253.61 } ` err = nodenft.MatchNFTRules(nftRules, nft.Dump()) Expect(err).NotTo(HaveOccurred()) @@ -1007,6 +1019,9 @@ add element inet ovn-kubernetes no-pmtud-remote-node-ips-v4 { 169.254.253.61 } node := corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: nodeName, + Annotations: map[string]string{ + util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", nodeIP+"/64"), + }, }, Status: corev1.NodeStatus{ Addresses: []corev1.NodeAddress{ @@ -1021,6 +1036,9 @@ add element inet ovn-kubernetes no-pmtud-remote-node-ips-v4 { 169.254.253.61 } otherNode := corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: remoteNodeName, + Annotations: map[string]string{ + util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", otherNodeIP+"/64"), + }, }, Status: corev1.NodeStatus{ Addresses: []corev1.NodeAddress{ @@ -1058,7 +1076,7 @@ add element inet ovn-kubernetes no-pmtud-remote-node-ips-v4 { 169.254.253.61 } cnnci := NewCommonNodeNetworkControllerInfo(kubeFakeClient, fakeClient.AdminPolicyRouteClient, wf, nil, nodeName, routeManager) nc = newDefaultNodeNetworkController(cnnci, stop, wg, routeManager, nil) nc.initRetryFrameworkForNode() - err = setupPMTUDNFTSets() + err = setupRemoteNodeNFTSets() Expect(err).NotTo(HaveOccurred()) err = setupPMTUDNFTChain() Expect(err).NotTo(HaveOccurred()) @@ -1082,7 +1100,7 @@ add element inet ovn-kubernetes no-pmtud-remote-node-ips-v4 { 169.254.253.61 } err = nc.WatchNodes() Expect(err).NotTo(HaveOccurred()) nftRules := v6PMTUDNFTRules + ` -add element inet ovn-kubernetes no-pmtud-remote-node-ips-v6 { 2001:db8:1::4 } +add element inet ovn-kubernetes remote-node-ips-v6 { 2001:db8:1::4 } ` err = nodenft.MatchNFTRules(nftRules, nft.Dump()) Expect(err).NotTo(HaveOccurred()) @@ -1111,6 +1129,9 @@ add element inet ovn-kubernetes no-pmtud-remote-node-ips-v6 { 2001:db8:1::4 } node := corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: nodeName, + Annotations: map[string]string{ + util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", nodeIP+"/64"), + }, }, Status: corev1.NodeStatus{ Addresses: []corev1.NodeAddress{ @@ -1125,6 +1146,9 @@ add element inet ovn-kubernetes no-pmtud-remote-node-ips-v6 { 2001:db8:1::4 } otherNode := corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: remoteNodeName, + Annotations: map[string]string{ + util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", otherSubnetNodeIP+"/64"), + }, }, Status: corev1.NodeStatus{ Addresses: []corev1.NodeAddress{ @@ -1162,7 +1186,7 @@ add element inet ovn-kubernetes no-pmtud-remote-node-ips-v6 { 2001:db8:1::4 } cnnci := NewCommonNodeNetworkControllerInfo(kubeFakeClient, fakeClient.AdminPolicyRouteClient, wf, nil, nodeName, routeManager) nc = newDefaultNodeNetworkController(cnnci, stop, wg, routeManager, nil) nc.initRetryFrameworkForNode() - err = setupPMTUDNFTSets() + err = setupRemoteNodeNFTSets() Expect(err).NotTo(HaveOccurred()) err = setupPMTUDNFTChain() Expect(err).NotTo(HaveOccurred()) @@ -1186,7 +1210,7 @@ add element inet ovn-kubernetes no-pmtud-remote-node-ips-v6 { 2001:db8:1::4 } err = nc.WatchNodes() Expect(err).NotTo(HaveOccurred()) nftRules 
:= v6PMTUDNFTRules + ` -add element inet ovn-kubernetes no-pmtud-remote-node-ips-v6 { 2002:db8:1::4 } +add element inet ovn-kubernetes remote-node-ips-v6 { 2002:db8:1::4 } ` err = nodenft.MatchNFTRules(nftRules, nft.Dump()) Expect(err).NotTo(HaveOccurred()) @@ -1323,7 +1347,7 @@ add element inet ovn-kubernetes no-pmtud-remote-node-ips-v6 { 2002:db8:1::4 } cnnci := NewCommonNodeNetworkControllerInfo(kubeFakeClient, fakeClient.AdminPolicyRouteClient, wf, nil, nodeName, routeManager) nc = newDefaultNodeNetworkController(cnnci, stop, wg, routeManager, nil) nc.initRetryFrameworkForNode() - err = setupPMTUDNFTSets() + err = setupRemoteNodeNFTSets() Expect(err).NotTo(HaveOccurred()) err = setupPMTUDNFTChain() Expect(err).NotTo(HaveOccurred()) @@ -1444,7 +1468,7 @@ add element inet ovn-kubernetes no-pmtud-remote-node-ips-v6 { 2002:db8:1::4 } cnnci := NewCommonNodeNetworkControllerInfo(kubeFakeClient, fakeClient.AdminPolicyRouteClient, wf, nil, nodeName, routeManager) nc = newDefaultNodeNetworkController(cnnci, stop, wg, routeManager, nil) nc.initRetryFrameworkForNode() - err = setupPMTUDNFTSets() + err = setupRemoteNodeNFTSets() Expect(err).NotTo(HaveOccurred()) err = setupPMTUDNFTChain() Expect(err).NotTo(HaveOccurred()) diff --git a/go-controller/pkg/node/gateway.go b/go-controller/pkg/node/gateway.go index 9b43fc95a5..fa812377e7 100644 --- a/go-controller/pkg/node/gateway.go +++ b/go-controller/pkg/node/gateway.go @@ -521,9 +521,9 @@ func (g *gateway) addAllServices() []error { func (g *gateway) updateSNATRules() error { subnets := util.IPsToNetworkIPs(g.nodeIPManager.mgmtPort.GetAddresses()...) - if g.GetDefaultPodNetworkAdvertised() || config.Gateway.Mode != config.GatewayModeLocal { - return delLocalGatewayPodSubnetNATRules(subnets...) + if config.Gateway.Mode != config.GatewayModeLocal { + return delLocalGatewayPodSubnetNFTRules() } - return addLocalGatewayPodSubnetNATRules(subnets...) + return addOrUpdateLocalGatewayPodSubnetNFTRules(g.GetDefaultPodNetworkAdvertised(), subnets...) 
} diff --git a/go-controller/pkg/node/gateway_init_linux_test.go b/go-controller/pkg/node/gateway_init_linux_test.go index 79886dbf38..7abfb4ed8d 100644 --- a/go-controller/pkg/node/gateway_init_linux_test.go +++ b/go-controller/pkg/node/gateway_init_linux_test.go @@ -80,6 +80,17 @@ add chain inet ovn-kubernetes udn-service-prerouting { type filter hook prerouti add rule inet ovn-kubernetes udn-service-prerouting iifname != %s jump udn-service-mark add chain inet ovn-kubernetes udn-service-output { type filter hook output priority -150 ; comment "UDN services packet mark - Output" ; } add rule inet ovn-kubernetes udn-service-output jump udn-service-mark +add chain inet ovn-kubernetes ovn-kube-udn-masq { comment "OVN UDN masquerade" ; } +add rule inet ovn-kubernetes ovn-kube-udn-masq ip saddr != 169.254.169.0/29 ip daddr != 172.16.1.0/24 ip saddr 169.254.169.0/24 masquerade +add rule inet ovn-kubernetes ovn-kube-local-gw-masq jump ovn-kube-udn-masq +` + +const baseLGWNFTablesRules = ` +add rule inet ovn-kubernetes ovn-kube-local-gw-masq ip saddr 169.254.169.1 masquerade +add chain inet ovn-kubernetes ovn-kube-local-gw-masq { type nat hook postrouting priority 101 ; comment "OVN local gateway masquerade" ; } +add rule inet ovn-kubernetes ovn-kube-local-gw-masq jump ovn-kube-pod-subnet-masq +add rule inet ovn-kubernetes ovn-kube-pod-subnet-masq ip saddr 10.1.1.0/24 masquerade +add chain inet ovn-kubernetes ovn-kube-pod-subnet-masq ` func getBaseNFTRules(mgmtPort string) string { @@ -90,6 +101,10 @@ func getBaseNFTRules(mgmtPort string) string { return ret } +func getBaseLGWNFTablesRules(mgmtPort string) string { + return getBaseNFTRules(mgmtPort) + baseLGWNFTablesRules +} + func shareGatewayInterfaceTest(app *cli.App, testNS ns.NetNS, eth0Name, eth0MAC, eth0GWIP, eth0CIDR string, gatewayVLANID uint, l netlink.Link, hwOffload, setNodeIP bool) { const mtu string = "1234" @@ -1358,10 +1373,6 @@ OFPT_GET_CONFIG_REPLY (xid=0x4): frags=normal miss_send_len=0` "OVN-KUBE-EXTERNALIP": []string{ fmt.Sprintf("-p %s -d %s --dport %v -j DNAT --to-destination %s:%v", service.Spec.Ports[0].Protocol, externalIP, service.Spec.Ports[0].Port, service.Spec.ClusterIP, service.Spec.Ports[0].Port), }, - "POSTROUTING": []string{ - "-s 169.254.169.1 -j MASQUERADE", - "-s 10.1.1.0/24 -j MASQUERADE", - }, "OVN-KUBE-ETP": []string{}, "OVN-KUBE-ITP": []string{}, }, @@ -1395,16 +1406,6 @@ OFPT_GET_CONFIG_REPLY (xid=0x4): frags=normal miss_send_len=0` expectedTables["filter"]["FORWARD"] = append(expectedMCSRules, expectedTables["filter"]["FORWARD"]...) expectedTables["filter"]["OUTPUT"] = append(expectedMCSRules, expectedTables["filter"]["OUTPUT"]...) 
// END OCP HACK - if util.IsNetworkSegmentationSupportEnabled() { - expectedTables["nat"]["POSTROUTING"] = append(expectedTables["nat"]["POSTROUTING"], - "-j OVN-KUBE-UDN-MASQUERADE", - ) - expectedTables["nat"]["OVN-KUBE-UDN-MASQUERADE"] = append(expectedTables["nat"]["OVN-KUBE-UDN-MASQUERADE"], - "-s 169.254.169.0/29 -j RETURN", // this guarantees we don't SNAT default network masqueradeIPs - "-d 172.16.1.0/24 -j RETURN", // this guarantees we don't SNAT service traffic - "-s 169.254.169.0/24 -j MASQUERADE", // this guarantees we SNAT all UDN MasqueradeIPs traffic leaving the node - ) - } f4 := iptV4.(*util.FakeIPTables) err = f4.MatchState(expectedTables, map[util.FakePolicyKey]string{{ Table: "filter", @@ -1421,7 +1422,7 @@ OFPT_GET_CONFIG_REPLY (xid=0x4): frags=normal miss_send_len=0` err = f6.MatchState(expectedTables, nil) Expect(err).NotTo(HaveOccurred()) - expectedNFT := getBaseNFTRules(types.K8sMgmtIntfName) + expectedNFT := getBaseLGWNFTablesRules(types.K8sMgmtIntfName) err = nodenft.MatchNFTRules(expectedNFT, nft.Dump()) Expect(err).NotTo(HaveOccurred()) diff --git a/go-controller/pkg/node/gateway_iptables.go b/go-controller/pkg/node/gateway_iptables.go index e9b6b12387..90bffbe91f 100644 --- a/go-controller/pkg/node/gateway_iptables.go +++ b/go-controller/pkg/node/gateway_iptables.go @@ -21,11 +21,10 @@ import ( ) const ( - iptableNodePortChain = "OVN-KUBE-NODEPORT" // called from nat-PREROUTING and nat-OUTPUT - iptableExternalIPChain = "OVN-KUBE-EXTERNALIP" // called from nat-PREROUTING and nat-OUTPUT - iptableETPChain = "OVN-KUBE-ETP" // called from nat-PREROUTING only - iptableITPChain = "OVN-KUBE-ITP" // called from mangle-OUTPUT and nat-OUTPUT - iptableUDNMasqueradeChain = "OVN-KUBE-UDN-MASQUERADE" // called from nat-POSTROUTING + iptableNodePortChain = "OVN-KUBE-NODEPORT" // called from nat-PREROUTING and nat-OUTPUT + iptableExternalIPChain = "OVN-KUBE-EXTERNALIP" // called from nat-PREROUTING and nat-OUTPUT + iptableETPChain = "OVN-KUBE-ETP" // called from nat-PREROUTING only + iptableITPChain = "OVN-KUBE-ITP" // called from mangle-OUTPUT and nat-OUTPUT ) func clusterIPTablesProtocols() []iptables.Protocol { @@ -69,29 +68,11 @@ func restoreIptRulesFiltered(rules []nodeipt.Rule, filter map[string]map[string] return nodeipt.RestoreRulesFiltered(rules, filter) } -// appendIptRules adds the provided rules in an append fashion -// i.e each rule gets added at the last position in the chain -func appendIptRules(rules []nodeipt.Rule) error { - return nodeipt.AddRules(rules, true) -} - // deleteIptRules removes provided rules from the chain func deleteIptRules(rules []nodeipt.Rule) error { return nodeipt.DelRules(rules) } -// ensureChain ensures that a chain exists within a table -func ensureChain(table, chain string) error { - for _, proto := range clusterIPTablesProtocols() { - ipt, err := util.GetIPTablesHelper(proto) - if err != nil { - return fmt.Errorf("failed to get IPTables helper to add UDN chain: %v", err) - } - addChaintoTable(ipt, table, chain) - } - return nil -} - func getGatewayInitRules(chain string, proto iptables.Protocol) []nodeipt.Rule { iptRules := []nodeipt.Rule{} if chain == iptableITPChain { @@ -403,123 +384,8 @@ func getLocalGatewayFilterRules(ifname string, cidr *net.IPNet) []nodeipt.Rule { } } -func getLocalGatewayPodSubnetNATRules(cidr *net.IPNet) []nodeipt.Rule { - protocol := getIPTablesProtocol(cidr.IP.String()) - return []nodeipt.Rule{ - { - Table: "nat", - Chain: "POSTROUTING", - Args: []string{ - "-s", cidr.String(), - "-j", "MASQUERADE", - }, - 
Protocol: protocol, - }, - } -} - -// getUDNMasqueradeRules is only called for local-gateway-mode -func getUDNMasqueradeRules(protocol iptables.Protocol) []nodeipt.Rule { - // the following rules are actively used only for the UDN Feature: - // -A POSTROUTING -j OVN-KUBE-UDN-MASQUERADE - // -A OVN-KUBE-UDN-MASQUERADE -s 169.254.0.0/29 -j RETURN - // -A OVN-KUBE-UDN-MASQUERADE -d 10.96.0.0/16 -j RETURN - // -A OVN-KUBE-UDN-MASQUERADE -s 169.254.0.0/17 -j MASQUERADE - // NOTE: Ordering is important here, the RETURN must come before - // the MASQUERADE rule. Please don't change the ordering. - srcUDNMasqueradePrefix := config.Gateway.V4MasqueradeSubnet - ipFamily := utilnet.IPv4 - if protocol == iptables.ProtocolIPv6 { - srcUDNMasqueradePrefix = config.Gateway.V6MasqueradeSubnet - ipFamily = utilnet.IPv6 - } - // defaultNetworkReservedMasqueradePrefix contains the first 6 IPs in the - // masquerade range that shouldn't be masqueraded. Hence it's always 3 bits (8 - // IPs) wide, regardless of IP family. - _, ipnet, _ := net.ParseCIDR(srcUDNMasqueradePrefix) - _, len := ipnet.Mask.Size() - defaultNetworkReservedMasqueradePrefix := fmt.Sprintf("%s/%d", ipnet.IP.String(), len-3) - - rules := []nodeipt.Rule{ - { - Table: "nat", - Chain: "POSTROUTING", - Args: []string{"-j", iptableUDNMasqueradeChain}, // NOTE: AddRules will take care of creating the chain - Protocol: protocol, - }, - { - Table: "nat", - Chain: iptableUDNMasqueradeChain, - Args: []string{ - "-s", defaultNetworkReservedMasqueradePrefix, - "-j", "RETURN", - }, - Protocol: protocol, - }, - } - for _, svcCIDR := range config.Kubernetes.ServiceCIDRs { - if utilnet.IPFamilyOfCIDR(svcCIDR) != ipFamily { - continue - } - rules = append(rules, - nodeipt.Rule{ - Table: "nat", - Chain: iptableUDNMasqueradeChain, - Args: []string{ - "-d", svcCIDR.String(), - "-j", "RETURN", - }, - Protocol: protocol, - }, - ) - } - rules = append(rules, - nodeipt.Rule{ - Table: "nat", - Chain: iptableUDNMasqueradeChain, - Args: []string{ - "-s", srcUDNMasqueradePrefix, - "-j", "MASQUERADE", - }, - Protocol: protocol, - }, - ) - return rules -} - -func getLocalGatewayNATRules(cidr *net.IPNet) []nodeipt.Rule { - // Allow packets to/from the gateway interface in case defaults deny - protocol := getIPTablesProtocol(cidr.IP.String()) - masqueradeIP := config.Gateway.MasqueradeIPs.V4OVNMasqueradeIP - if protocol == iptables.ProtocolIPv6 { - masqueradeIP = config.Gateway.MasqueradeIPs.V6OVNMasqueradeIP - } - rules := append( - []nodeipt.Rule{ - { - Table: "nat", - Chain: "POSTROUTING", - Args: []string{ - "-s", masqueradeIP.String(), - "-j", "MASQUERADE", - }, - Protocol: protocol, - }, - }, - getLocalGatewayPodSubnetNATRules(cidr)..., - ) - - // FIXME(tssurya): If the feature is disabled we should be removing - // these rules - if util.IsNetworkSegmentationSupportEnabled() { - rules = append(rules, getUDNMasqueradeRules(protocol)...) - } - - return rules -} - -// initLocalGatewayNATRules sets up iptables rules for interfaces -func initLocalGatewayNATRules(ifname string, cidr *net.IPNet) error { +// initLocalGatewayIPTFilterRules sets up iptables rules for interfaces +func initLocalGatewayIPTFilterRules(ifname string, cidr *net.IPNet) error { // Insert the filter table rules because they need to be evaluated BEFORE the DROP rules // we have for forwarding. DO NOT change the ordering; specially important // during SGW->LGW rollouts and restarts. 
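For reference, a minimal sketch of the reserved-prefix arithmetic used by the getUDNMasqueradeRules implementation removed above (and reused by getUDNMasqueradeNFTRules further down), assuming the IPv4 masquerade subnet from the test expectations (169.254.169.0/24):

	_, ipnet, _ := net.ParseCIDR("169.254.169.0/24")
	_, bits := ipnet.Mask.Size()                       // bits == 32 for IPv4, 128 for IPv6
	reserved := fmt.Sprintf("%s/%d", ipnet.IP, bits-3) // "169.254.169.0/29", i.e. the first 8 IPs stay un-SNATed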
@@ -527,25 +393,8 @@ func initLocalGatewayNATRules(ifname string, cidr *net.IPNet) error { if err != nil { return fmt.Errorf("unable to insert forwarding rules %v", err) } - // append the masquerade rules in POSTROUTING table since that needs to be - // evaluated last. - return appendIptRules(getLocalGatewayNATRules(cidr)) -} - -func addLocalGatewayPodSubnetNATRules(cidrs ...*net.IPNet) error { - var rules []nodeipt.Rule - for _, cidr := range cidrs { - rules = append(rules, getLocalGatewayPodSubnetNATRules(cidr)...) - } - return appendIptRules(rules) -} - -func delLocalGatewayPodSubnetNATRules(cidrs ...*net.IPNet) error { - var rules []nodeipt.Rule - for _, cidr := range cidrs { - rules = append(rules, getLocalGatewayPodSubnetNATRules(cidr)...) - } - return deleteIptRules(rules) + // NOTE: nftables masquerade rules are now handled separately in initLocalGatewayNFTNATRules + return nil } func addChaintoTable(ipt util.IPTablesHelper, tableName, chain string) { diff --git a/go-controller/pkg/node/gateway_localnet.go b/go-controller/pkg/node/gateway_localnet.go index e0cc822844..6b8ed9aa0b 100644 --- a/go-controller/pkg/node/gateway_localnet.go +++ b/go-controller/pkg/node/gateway_localnet.go @@ -17,11 +17,11 @@ import ( func initLocalGateway(hostSubnets []*net.IPNet, mgmtPort managementport.Interface) error { klog.Info("Adding iptables masquerading rules for new local gateway") - if util.IsNetworkSegmentationSupportEnabled() { - if err := ensureChain("nat", iptableUDNMasqueradeChain); err != nil { - return fmt.Errorf("failed to ensure chain %s in NAT table: %w", iptableUDNMasqueradeChain, err) - } - } + + var allCIDRs []*net.IPNet + ifName := mgmtPort.GetInterfaceName() + + // First pass: collect all CIDRs and setup iptables filter rules per interface for _, hostSubnet := range hostSubnets { // local gateway mode uses mp0 as default path for all ingress traffic into OVN nextHop, err := util.MatchFirstIPNetFamily(utilnet.IsIPv6CIDR(hostSubnet), mgmtPort.GetAddresses()) @@ -32,11 +32,21 @@ func initLocalGateway(hostSubnets []*net.IPNet, mgmtPort managementport.Interfac // add iptables masquerading for mp0 to exit the host for egress cidr := nextHop.IP.Mask(nextHop.Mask) cidrNet := &net.IPNet{IP: cidr, Mask: nextHop.Mask} - ifName := mgmtPort.GetInterfaceName() - if err := initLocalGatewayNATRules(ifName, cidrNet); err != nil { + allCIDRs = append(allCIDRs, cidrNet) + + // Setup iptables filter rules for this interface/CIDR + if err := initLocalGatewayIPTFilterRules(ifName, cidrNet); err != nil { return fmt.Errorf("failed to add local NAT rules for: %s, err: %v", ifName, err) } } + + // setup nftables masquerade rules for all CIDRs (v4, v6 or dualstack) + if len(allCIDRs) > 0 { + if err := initLocalGatewayNFTNATRules(allCIDRs...); err != nil { + return fmt.Errorf("failed to setup nftables masquerade rules: %w", err) + } + } + return nil } diff --git a/go-controller/pkg/node/gateway_nftables.go b/go-controller/pkg/node/gateway_nftables.go index 842bb417d1..b38f2baebb 100644 --- a/go-controller/pkg/node/gateway_nftables.go +++ b/go-controller/pkg/node/gateway_nftables.go @@ -6,12 +6,14 @@ package node import ( "context" "fmt" + "net" "strings" corev1 "k8s.io/api/core/v1" utilnet "k8s.io/utils/net" "sigs.k8s.io/knftables" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/bridgeconfig" nodenft "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/nftables" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" @@ -27,6 
+29,13 @@ import ( // use an "accept" rule to override a later "drop" rule), then those rules will need to // either both be iptables or both be nftables. +// nftables chain names +const ( + nftablesLocalGatewayMasqChain = "ovn-kube-local-gw-masq" + nftablesPodSubnetMasqChain = "ovn-kube-pod-subnet-masq" + nftablesUDNMasqChain = "ovn-kube-udn-masq" +) + // getNoSNATNodePortRules returns elements to add to the "mgmtport-no-snat-nodeports" // set to prevent SNAT of sourceIP when passing through the management port, for an // `externalTrafficPolicy: Local` service with NodePorts. @@ -186,3 +195,320 @@ func getUDNNFTRules(service *corev1.Service, netConfig *bridgeconfig.BridgeUDNCo } return rules } + +// getLocalGatewayPodSubnetMasqueradeNFTRule creates a rule for masquerading traffic from the pod subnet CIDR +// on the local gateway node in a separate chain which is then called from the local gateway masquerade chain. +// +// chain ovn-kube-pod-subnet-masq { +// ip saddr 10.244.0.0/24 masquerade +// ip6 saddr fd00:10:244:1::/64 masquerade +// } +// +// If isAdvertisedNetwork is true, masquerade only when destination matches remote node IPs. +// Rules look like: +// ip saddr 10.244.0.0/24 ip daddr @remote-node-ips-v4 masquerade +// ip6 saddr fd00:10:244:1::/64 ip6 daddr @remote-node-ips-v6 masquerade +func getLocalGatewayPodSubnetMasqueradeNFTRule(cidr *net.IPNet, isAdvertisedNetwork bool) (*knftables.Rule, error) { + // Create the rule for masquerading traffic from the CIDR + var ipPrefix string + var remoteNodeSetName string + if utilnet.IsIPv6CIDR(cidr) { + ipPrefix = "ip6" + remoteNodeSetName = types.NFTRemoteNodeIPsv6 + } else { + ipPrefix = "ip" + remoteNodeSetName = types.NFTRemoteNodeIPsv4 + } + + // If network is advertised, only masquerade if destination is a remote node IP + var optionalDestRules []string + if isAdvertisedNetwork { + optionalDestRules = []string{ipPrefix, "daddr", "@", remoteNodeSetName} + } + rule := &knftables.Rule{ + Rule: knftables.Concat( + ipPrefix, "saddr", cidr, + optionalDestRules, + "masquerade", + ), + Chain: nftablesPodSubnetMasqChain, + } + + return rule, nil +} + +// getLocalGatewayNATNFTRules returns the nftables rules for local gateway NAT including masquerade IP rule, +// pod subnet rules, and UDN masquerade rules (if network segmentation is enabled). +// This function supports dual-stack by accepting multiple CIDRs and generating rules for all IP families.
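+// +// A note on knftables.Concat as used by the rule builders here (stated as an assumption from the library's documented behavior): an argument of "@" is not followed by a space, so Concat("ip", "daddr", "@", setName) yields "ip daddr @setName".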
+// +// chain ovn-kube-local-gw-masq { +// comment "OVN local gateway masquerade" +// type nat hook postrouting priority srcnat; policy accept; +// ip saddr 169.254.0.1 masquerade +// ip6 saddr fd69::1 masquerade +// jump ovn-kube-pod-subnet-masq +// jump ovn-kube-udn-masq +// } +func getLocalGatewayNATNFTRules(cidrs ...*net.IPNet) ([]*knftables.Rule, error) { + var rules []*knftables.Rule + + // Process each CIDR to support dual-stack + for _, cidr := range cidrs { + // Determine IP version and masquerade IP + isIPv6 := utilnet.IsIPv6CIDR(cidr) + var masqueradeIP net.IP + var ipPrefix string + if isIPv6 { + masqueradeIP = config.Gateway.MasqueradeIPs.V6OVNMasqueradeIP + ipPrefix = "ip6" + } else { + masqueradeIP = config.Gateway.MasqueradeIPs.V4OVNMasqueradeIP + ipPrefix = "ip" + } + + // Rule1: Masquerade IP rule for the main chain + masqRule := &knftables.Rule{ + Chain: nftablesLocalGatewayMasqChain, + Rule: knftables.Concat( + ipPrefix, "saddr", masqueradeIP, + "masquerade", + ), + } + rules = append(rules, masqRule) + + // Rule2: Pod subnet NAT rule for the pod subnet chain + podSubnetRule, err := getLocalGatewayPodSubnetMasqueradeNFTRule(cidr, false) + if err != nil { + return nil, fmt.Errorf("failed to create pod subnet masquerade rule: %w", err) + } + rules = append(rules, podSubnetRule) + } + + // Rule 3: UDN masquerade rules (if network segmentation is enabled) + if util.IsNetworkSegmentationSupportEnabled() { + if config.IPv4Mode { + udnRules, err := getUDNMasqueradeNFTRules(utilnet.IPv4) + if err != nil { + return nil, fmt.Errorf("failed to create IPv4 UDN masquerade rules: %w", err) + } + rules = append(rules, udnRules...) + } + if config.IPv6Mode { + udnRules, err := getUDNMasqueradeNFTRules(utilnet.IPv6) + if err != nil { + return nil, fmt.Errorf("failed to create IPv6 UDN masquerade rules: %w", err) + } + rules = append(rules, udnRules...) + } + } + + return rules, nil +} + +// getUDNMasqueradeNFTRules returns the nftables rules for UDN masquerade. +// Chain creation is handled separately by setupLocalGatewayNATNFTRules. 
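+// +// Note: the masquerade rule below is emitted once per service CIDR of the matching family, so if no service CIDR of that family is configured, no UDN masquerade rule is generated for it.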
+// +// chain ovn-kube-udn-masq { +// comment "OVN UDN masquerade" +// ip saddr != 169.254.0.0/29 ip daddr != 10.96.0.0/16 ip saddr 169.254.0.0/17 masquerade +// ip6 saddr != fd69::/125 ip6 daddr != fd00:10:96::/112 ip6 saddr fd69::/112 masquerade +// } +func getUDNMasqueradeNFTRules(ipFamily utilnet.IPFamily) ([]*knftables.Rule, error) { + var rules []*knftables.Rule + + // Determine subnet and IP family + srcUDNMasqueradePrefix := config.Gateway.V4MasqueradeSubnet + ipPrefix := "ip" + if ipFamily == utilnet.IPv6 { + srcUDNMasqueradePrefix = config.Gateway.V6MasqueradeSubnet + ipPrefix = "ip6" + } + + // Calculate reserved masquerade prefix (first 8 IPs) + _, ipnet, err := net.ParseCIDR(srcUDNMasqueradePrefix) + if err != nil { + return nil, fmt.Errorf("failed to parse UDN masquerade subnet: %w", err) + } + _, prefixLen := ipnet.Mask.Size() + defaultNetworkReservedMasqueradePrefix := fmt.Sprintf("%s/%d", ipnet.IP.String(), prefixLen-3) + + // Rule: skip the reserved masquerade prefix and service CIDRs (via != matches); + // the rest of the UDN masquerade subnet traffic is masqueraded + + for _, svcCIDR := range config.Kubernetes.ServiceCIDRs { + if utilnet.IPFamilyOfCIDR(svcCIDR) != ipFamily { + continue + } + masqueradeRule := &knftables.Rule{ + Chain: nftablesUDNMasqChain, + Rule: knftables.Concat( + ipPrefix, "saddr", "!=", defaultNetworkReservedMasqueradePrefix, // this guarantees we don't SNAT default network masqueradeIPs + ipPrefix, "daddr", "!=", svcCIDR, // this guarantees we don't SNAT service traffic + ipPrefix, "saddr", srcUDNMasqueradePrefix, // this guarantees we SNAT all UDN MasqueradeIPs traffic leaving the node + "masquerade", + ), + } + rules = append(rules, masqueradeRule) + } + + return rules, nil +} + +// initLocalGatewayNFTNATRules sets up nftables rules for local gateway NAT functionality +// This function supports dual-stack by accepting multiple CIDRs and generating rules for all IP families +func initLocalGatewayNFTNATRules(cidrs ...*net.IPNet) error { + nft, err := nodenft.GetNFTablesHelper() + if err != nil { + return fmt.Errorf("failed to get nftables helper: %w", err) + } + + // Create transaction and apply all chains and rules + tx := nft.NewTransaction() + + // Create main local gateway masquerade chain + // Use priority 101 instead of the default knftables.SNATPriority (100) to ensure + // iptables egress IP rules in OVN-KUBE-EGRESS-IP-MULTI-NIC chain run first + // this also ensures that for egress services, the chain + // egress-services { + // type nat hook postrouting priority srcnat; policy accept; + // is called before the local gateway masquerade chain + localGwMasqChain := &knftables.Chain{ + Name: nftablesLocalGatewayMasqChain, + Comment: knftables.PtrTo("OVN local gateway masquerade"), + Type: knftables.PtrTo(knftables.NATType), + Hook: knftables.PtrTo(knftables.PostroutingHook), + Priority: knftables.PtrTo(knftables.BaseChainPriority("101")), + } + tx.Add(localGwMasqChain) + + // Create dedicated pod subnet masquerade chain + podSubnetMasqChain := &knftables.Chain{ + Name: nftablesPodSubnetMasqChain, + } + tx.Add(podSubnetMasqChain) + + // Create UDN masquerade chain only if network segmentation is enabled + var udnMasqChain *knftables.Chain + if util.IsNetworkSegmentationSupportEnabled() { + udnMasqChain = &knftables.Chain{ + Name: nftablesUDNMasqChain, + Comment: knftables.PtrTo("OVN UDN masquerade"), + } + tx.Add(udnMasqChain) + } + + // Flush existing chains to ensure clean state + tx.Flush(localGwMasqChain) + tx.Flush(podSubnetMasqChain) + if util.IsNetworkSegmentationSupportEnabled() { +
tx.Flush(udnMasqChain) + } + + // Get the existing local gateway NAT rules + localGwRules, err := getLocalGatewayNATNFTRules(cidrs...) + if err != nil { + return fmt.Errorf("failed to get local gateway NAT rules: %w", err) + } + + // Add the main local gateway NAT rules + for _, rule := range localGwRules { + tx.Add(rule) + } + + // Add jump rule from main chain to pod subnet chain + jumpToPodSubnetRule := &knftables.Rule{ + Chain: nftablesLocalGatewayMasqChain, + Rule: knftables.Concat( + "jump", nftablesPodSubnetMasqChain, + ), + } + tx.Add(jumpToPodSubnetRule) + + // Add jump rule to UDN chain only if network segmentation is enabled + if util.IsNetworkSegmentationSupportEnabled() { + jumpToUDNRule := &knftables.Rule{ + Chain: nftablesLocalGatewayMasqChain, + Rule: knftables.Concat( + "jump", nftablesUDNMasqChain, + ), + } + tx.Add(jumpToUDNRule) + } + + err = nft.Run(context.TODO(), tx) + if err != nil { + return fmt.Errorf("failed to setup local gateway NAT nftables rules: %w", err) + } + + return nil +} + +// addOrUpdateLocalGatewayPodSubnetNFTRules adds nftables rules for pod subnet masquerading for multiple CIDRs +// These rules are added to the dedicated pod subnet masquerade chain. +// If the rules already exist, they are updated. +// If isAdvertisedNetwork is true, the masquerade rules also get a destination match +// that matches the remote node IP set. +func addOrUpdateLocalGatewayPodSubnetNFTRules(isAdvertisedNetwork bool, cidrs ...*net.IPNet) error { + nft, err := nodenft.GetNFTablesHelper() + if err != nil { + return fmt.Errorf("failed to get nftables helper: %w", err) + } + + tx := nft.NewTransaction() + + // Ensure the pod subnet chain exists + podSubnetChain := &knftables.Chain{ + Name: nftablesPodSubnetMasqChain, + } + tx.Add(podSubnetChain) + + // Flush the chain to remove all existing rules + // if network toggles between advertised and non-advertised, we need to flush the chain and re-add correct rules + tx.Flush(podSubnetChain) + + // Add the new rules for each CIDR + for _, cidr := range cidrs { + rule, err := getLocalGatewayPodSubnetMasqueradeNFTRule(cidr, isAdvertisedNetwork) + if err != nil { + return fmt.Errorf("failed to create nftables rules for CIDR %s: %w", cidr.String(), err) + } + + // Add the rule + tx.Add(rule) + } + + if err := nft.Run(context.TODO(), tx); err != nil { + return fmt.Errorf("failed to add pod subnet NAT rules: %w", err) + } + + return nil +} + +// delLocalGatewayPodSubnetNFTRules removes nftables rules for pod subnet masquerading for multiple CIDRs +// Since we use a separate chain, we can simply flush it to remove all pod subnet rules. +func delLocalGatewayPodSubnetNFTRules() error { + nft, err := nodenft.GetNFTablesHelper() + if err != nil { + return fmt.Errorf("failed to get nftables helper: %w", err) + } + + tx := nft.NewTransaction() + + // In shared gateway mode, this chain might not exist if we have not + // migrated from local gateway mode. In that case, use the idiomatic + // approach of adding the chain before trying to flush it. The + // knftables.IsNotFound() check on the Run error below is kept as an + // additional safeguard.
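+ // Assumption about nftables semantics worth noting: "add chain" on an existing chain is a no-op, which is what makes this add-then-flush pattern safe whether or not the chain already exists.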
+ tx.Add(&knftables.Chain{ + Name: nftablesPodSubnetMasqChain, + }) + + // Simply flush the dedicated pod subnet masquerade chain + // This removes all pod subnet masquerade rules at once + tx.Flush(&knftables.Chain{Name: nftablesPodSubnetMasqChain}) + + if err := nft.Run(context.TODO(), tx); err != nil && !knftables.IsNotFound(err) { + return fmt.Errorf("failed to delete pod subnet NAT rules: %w", err) + } + + return nil +} diff --git a/go-controller/pkg/node/gateway_udn.go b/go-controller/pkg/node/gateway_udn.go index 026ecd94fc..32512d8287 100644 --- a/go-controller/pkg/node/gateway_udn.go +++ b/go-controller/pkg/node/gateway_udn.go @@ -89,6 +89,10 @@ type UserDefinedNetworkGateway struct { // gwInterfaceIndex holds the link index of gateway interface gwInterfaceIndex int + + // save BGP state at the start of reconciliation loop run to handle it consistently throughout the run + isNetworkAdvertisedToDefaultVRF bool + isNetworkAdvertised bool } func NewUserDefinedNetworkGateway(netInfo util.NetInfo, node *corev1.Node, nodeLister listers.NodeLister, @@ -225,18 +229,18 @@ func (udng *UserDefinedNetworkGateway) AddNetwork() error { return fmt.Errorf("could not add VRF %s routes for network %s, err: %v", vrfDeviceName, udng.GetNetworkName(), err) } - isNetworkAdvertised := util.IsPodNetworkAdvertisedAtNode(udng.NetInfo, udng.node.Name) + udng.updateAdvertisementStatus() // create the iprules for this network - if err = udng.updateUDNVRFIPRules(isNetworkAdvertised); err != nil { + if err = udng.updateUDNVRFIPRules(); err != nil { return fmt.Errorf("failed to update IP rules for network %s: %w", udng.GetNetworkName(), err) } - if err = udng.updateAdvertisedUDNIsolationRules(isNetworkAdvertised); err != nil { + if err = udng.updateAdvertisedUDNIsolationRules(); err != nil { return fmt.Errorf("failed to update isolation rules for network %s: %w", udng.GetNetworkName(), err) } - if err := udng.updateUDNVRFIPRoute(isNetworkAdvertised); err != nil { + if err := udng.updateUDNVRFIPRoute(); err != nil { return fmt.Errorf("failed to update ip routes for network %s: %w", udng.GetNetworkName(), err) } @@ -314,18 +318,16 @@ func (udng *UserDefinedNetworkGateway) DelNetwork() error { } } - if util.IsPodNetworkAdvertisedAtNode(udng.NetInfo, udng.node.Name) { - err := udng.updateAdvertisedUDNIsolationRules(false) - if err != nil { - return fmt.Errorf("failed to remove advertised UDN isolation rules for network %s: %w", udng.GetNetworkName(), err) - } + err := udng.deleteAdvertisedUDNIsolationRules() + if err != nil { + return fmt.Errorf("failed to remove advertised UDN isolation rules for network %s: %w", udng.GetNetworkName(), err) } if err := udng.delMarkChain(); err != nil { return err } // delete the management port interface for this network - err := udng.deleteUDNManagementPort() + err = udng.deleteUDNManagementPort() if err != nil { return err } @@ -483,8 +485,7 @@ func (udng *UserDefinedNetworkGateway) computeRoutesForUDN(mpLink netlink.Link) // Route2: Add default route: default via 172.18.0.1 dev breth0 mtu 1400 // necessary for UDN CNI and host-networked pods default traffic to go to node's gatewayIP - isNetworkAdvertised := util.IsPodNetworkAdvertisedAtNode(udng.NetInfo, udng.node.Name) - defaultRoute, err := udng.getDefaultRoute(isNetworkAdvertised) + defaultRoute, err := udng.getDefaultRouteExceptIfVRFLite() if err != nil { return nil, fmt.Errorf("unable to add default route for network %s, err: %v", udng.GetNetworkName(), err) } @@ -585,15 +586,7 @@ func (udng *UserDefinedNetworkGateway) 
computeRoutesForUDN(mpLink netlink.Link) return retVal, nil } -func (udng *UserDefinedNetworkGateway) getDefaultRoute(isNetworkAdvertised bool) ([]netlink.Route, error) { - vrfs := udng.GetPodNetworkAdvertisedOnNodeVRFs(udng.node.Name) - // If the network is advertised on a non default VRF then we should only consider routes received from external BGP - // device and not send any traffic based on default route similar to one present in default VRF. This is more important - // for VRF-Lite usecase where we need traffic to leave from vlan device instead of default gateway interface. - if isNetworkAdvertised && !slices.Contains(vrfs, types.DefaultNetworkName) { - return nil, nil - } - +func (udng *UserDefinedNetworkGateway) getDefaultRoute() ([]netlink.Route, error) { networkMTU := udng.NetInfo.MTU() if networkMTU == 0 { networkMTU = config.Default.MTU @@ -618,6 +611,16 @@ func (udng *UserDefinedNetworkGateway) getDefaultRoute(isNetworkAdvertised bool) return retVal, nil } +func (udng *UserDefinedNetworkGateway) getDefaultRouteExceptIfVRFLite() ([]netlink.Route, error) { + // If the network is advertised on a non-default VRF then we should only consider routes received from the external BGP + // device and not send any traffic based on a default route similar to the one present in the default VRF. This is more important + // for the VRF-Lite use case, where we need traffic to leave via the VLAN device instead of the default gateway interface. + if udng.isNetworkAdvertised && !udng.isNetworkAdvertisedToDefaultVRF { + return nil, nil + } + return udng.getDefaultRoute() +} + // getV4MasqueradeIP returns the V4 management port masqueradeIP for this network func (udng *UserDefinedNetworkGateway) getV4MasqueradeIP() (*net.IPNet, error) { if !config.IPv4Mode { @@ -644,18 +647,18 @@ func (udng *UserDefinedNetworkGateway) getV6MasqueradeIP() (*net.IPNet, error) { // constructUDNVRFIPRules constructs rules that redirect matching packets // into the corresponding UDN VRF routing table. -// If the network is not advertised, an example of the rules we set for a -// network is: -// 2000: from all fwmark 0x1001 lookup 1007 -// 2000: from all to 169.254.0.12 lookup 1007 -// 2000: from all fwmark 0x1002 lookup 1009 -// 2000: from all to 169.254.0.14 lookup 1009 -// If the network is advertised, an example of the rules we set for a network is: +// +// When a network is not advertised on the default VRF, an example of the rules +// we set for it is: +// 2000: from all fwmark 0x1001 lookup 1007 +// 2000: from all to 169.254.0.12 lookup 1007 +// +// When a network is advertised on the default VRF, an example of the rules +// we set for it is: // 2000: from all fwmark 0x1001 lookup 1007 // 2000: from all to 10.132.0.0/14 lookup 1007 -// 2000: from all fwmark 0x1001 lookup 1009 -// 2000: from all to 10.134.0.0/14 lookup 1009 +// 2000: from all to 169.254.0.12 lookup 1007 -func (udng *UserDefinedNetworkGateway) constructUDNVRFIPRules(isNetworkAdvertised bool) ([]netlink.Rule, []netlink.Rule, error) { +func (udng *UserDefinedNetworkGateway) constructUDNVRFIPRules() ([]netlink.Rule, []netlink.Rule, error) { var addIPRules []netlink.Rule var delIPRules []netlink.Rule var masqIPRules []netlink.Rule @@ -688,12 +691,14 @@ func (udng *UserDefinedNetworkGateway) constructUDNVRFIPRules(isNetworkAdvertise } } switch { - case !isNetworkAdvertised: + case udng.isNetworkAdvertisedToDefaultVRF: + // the network is advertised to the default VRF addIPRules = append(addIPRules, masqIPRules...) - delIPRules = append(delIPRules, subnetIPRules...)
- default: addIPRules = append(addIPRules, subnetIPRules...) - delIPRules = append(delIPRules, masqIPRules...) + default: + // network is not advertised on the default VRF + addIPRules = append(addIPRules, masqIPRules...) + delIPRules = append(delIPRules, subnetIPRules...) } return addIPRules, delIPRules, nil } @@ -791,19 +796,20 @@ func (udng *UserDefinedNetworkGateway) doReconcile() error { return fmt.Errorf("openflow manager with default bridge configuration has not been provided for network %s", udng.GetNetworkName()) } + udng.updateAdvertisementStatus() + // update bridge configuration - isNetworkAdvertised := util.IsPodNetworkAdvertisedAtNode(udng.NetInfo, udng.node.Name) netConfig := udng.openflowManager.defaultBridge.GetNetworkConfig(udng.GetNetworkName()) if netConfig == nil { return fmt.Errorf("missing bridge configuration for network %s", udng.GetNetworkName()) } - netConfig.Advertised.Store(isNetworkAdvertised) + netConfig.Advertised.Store(udng.isNetworkAdvertised) - if err := udng.updateUDNVRFIPRules(isNetworkAdvertised); err != nil { + if err := udng.updateUDNVRFIPRules(); err != nil { return fmt.Errorf("error while updating ip rule for UDN %s: %s", udng.GetNetworkName(), err) } - if err := udng.updateUDNVRFIPRoute(isNetworkAdvertised); err != nil { + if err := udng.updateUDNVRFIPRoute(); err != nil { return fmt.Errorf("error while updating ip route for UDN %s: %s", udng.GetNetworkName(), err) } @@ -817,16 +823,16 @@ func (udng *UserDefinedNetworkGateway) doReconcile() error { // let's sync these flows immediately udng.openflowManager.requestFlowSync() - if err := udng.updateAdvertisedUDNIsolationRules(isNetworkAdvertised); err != nil { + if err := udng.updateAdvertisedUDNIsolationRules(); err != nil { return fmt.Errorf("error while updating advertised UDN isolation rules for network %s: %w", udng.GetNetworkName(), err) } return nil } // updateUDNVRFIPRules updates IP rules for a network depending on whether the -// network is advertised or not -func (udng *UserDefinedNetworkGateway) updateUDNVRFIPRules(isNetworkAdvertised bool) error { - addIPRules, deleteIPRules, err := udng.constructUDNVRFIPRules(isNetworkAdvertised) +// network is advertised to the default VRF or not +func (udng *UserDefinedNetworkGateway) updateUDNVRFIPRules() error { + addIPRules, deleteIPRules, err := udng.constructUDNVRFIPRules() if err != nil { return fmt.Errorf("unable to get iprules for network %s, err: %v", udng.GetNetworkName(), err) } @@ -845,30 +851,40 @@ func (udng *UserDefinedNetworkGateway) updateUDNVRFIPRules(isNetworkAdvertised b } // Add or remove the default route from a VRF device based on whether the network is -// advertised on its own network or default network -func (udng *UserDefinedNetworkGateway) updateUDNVRFIPRoute(isNetworkAdvertised bool) error { - vrfs := udng.GetPodNetworkAdvertisedOnNodeVRFs(udng.node.Name) - if isNetworkAdvertised && !slices.Contains(vrfs, types.DefaultNetworkName) { +// advertised on its own network or the default network +func (udng *UserDefinedNetworkGateway) updateUDNVRFIPRoute() error { + vrfName := util.GetNetworkVRFName(udng.NetInfo) + + switch { + case udng.isNetworkAdvertised && !udng.isNetworkAdvertisedToDefaultVRF: + // Remove default route for networks advertised to a non-default VRF if err := udng.removeDefaultRouteFromVRF(); err != nil { - return fmt.Errorf("error while removing default route from VRF %s corresponding to network %s: %s", - util.GetNetworkVRFName(udng.NetInfo), udng.GetNetworkName(), err) + return fmt.Errorf("failed to remove default route 
from VRF %s for network %s: %v", + vrfName, udng.GetNetworkName(), err) } - } else if !isNetworkAdvertised || slices.Contains(vrfs, types.DefaultNetworkName) { - defaultRoute, err := udng.getDefaultRoute(isNetworkAdvertised) + + default: + // Add default route for networks that are either: + // - not advertised + // - advertised to default VRF + defaultRoute, err := udng.getDefaultRouteExceptIfVRFLite() if err != nil { - return fmt.Errorf("unable to get default route for network %s, err: %v", udng.GetNetworkName(), err) + return fmt.Errorf("failed to get default route for network %s: %v", + udng.GetNetworkName(), err) } - if err = udng.vrfManager.AddVRFRoutes(util.GetNetworkVRFName(udng.NetInfo), defaultRoute); err != nil { - return fmt.Errorf("error while adding default route to VRF %s corresponding to network %s, err: %v", - util.GetNetworkVRFName(udng.NetInfo), udng.GetNetworkName(), err) + + if err = udng.vrfManager.AddVRFRoutes(vrfName, defaultRoute); err != nil { + return fmt.Errorf("failed to add default route to VRF %s for network %s: %v", + vrfName, udng.GetNetworkName(), err) } } + return nil } func (udng *UserDefinedNetworkGateway) removeDefaultRouteFromVRF() error { vrfDeviceName := util.GetNetworkVRFName(udng.NetInfo) - defaultRoute, err := udng.getDefaultRoute(false) + defaultRoute, err := udng.getDefaultRoute() if err != nil { return fmt.Errorf("unable to get default route for network %s, err: %v", udng.GetNetworkName(), err) } @@ -897,39 +913,22 @@ func (udng *UserDefinedNetworkGateway) removeDefaultRouteFromVRF() error { // comment "advertised UDNs V4 subnets" // elements = { 10.10.0.0/16 comment "cluster_udn_l3network" } // } -func (udng *UserDefinedNetworkGateway) updateAdvertisedUDNIsolationRules(isNetworkAdvertised bool) error { +func (udng *UserDefinedNetworkGateway) updateAdvertisedUDNIsolationRules() error { + switch { + case udng.isNetworkAdvertised: + return udng.addAdvertisedUDNIsolationRules() + default: + return udng.deleteAdvertisedUDNIsolationRules() + } +} + +func (udng *UserDefinedNetworkGateway) addAdvertisedUDNIsolationRules() error { nft, err := nodenft.GetNFTablesHelper() if err != nil { return fmt.Errorf("failed to get nftables helper: %v", err) } tx := nft.NewTransaction() - if !isNetworkAdvertised { - existingV4, err := nft.ListElements(context.TODO(), "set", nftablesAdvertisedUDNsSetV4) - if err != nil { - if !knftables.IsNotFound(err) { - return fmt.Errorf("could not list existing items in %s set: %w", nftablesAdvertisedUDNsSetV4, err) - } - } - existingV6, err := nft.ListElements(context.TODO(), "set", nftablesAdvertisedUDNsSetV6) - if err != nil { - if !knftables.IsNotFound(err) { - return fmt.Errorf("could not list existing items in %s set: %w", nftablesAdvertisedUDNsSetV6, err) - } - } - - for _, elem := range append(existingV4, existingV6...) 
{ - if elem.Comment != nil && *elem.Comment == udng.GetNetworkName() { - tx.Delete(elem) - } - } - - if tx.NumOperations() == 0 { - return nil - } - return nft.Run(context.TODO(), tx) - } - for _, udnNet := range udng.Subnets() { set := nftablesAdvertisedUDNsSetV4 if utilnet.IsIPv6CIDR(udnNet.CIDR) { set = nftablesAdvertisedUDNsSetV6 } @@ -948,3 +947,41 @@ func (udng *UserDefinedNetworkGateway) updateAdvertisedUDNIsolationRules(isNetwo } return nft.Run(context.TODO(), tx) } + +func (udng *UserDefinedNetworkGateway) deleteAdvertisedUDNIsolationRules() error { + nft, err := nodenft.GetNFTablesHelper() + if err != nil { + return fmt.Errorf("failed to get nftables helper: %v", err) + } + tx := nft.NewTransaction() + + existingV4, err := nft.ListElements(context.TODO(), "set", nftablesAdvertisedUDNsSetV4) + if err != nil { + if !knftables.IsNotFound(err) { + return fmt.Errorf("could not list existing items in %s set: %w", nftablesAdvertisedUDNsSetV4, err) + } + } + existingV6, err := nft.ListElements(context.TODO(), "set", nftablesAdvertisedUDNsSetV6) + if err != nil { + if !knftables.IsNotFound(err) { + return fmt.Errorf("could not list existing items in %s set: %w", nftablesAdvertisedUDNsSetV6, err) + } + } + + for _, elem := range append(existingV4, existingV6...) { + if elem.Comment != nil && *elem.Comment == udng.GetNetworkName() { + tx.Delete(elem) + } + } + + if tx.NumOperations() == 0 { + return nil + } + return nft.Run(context.TODO(), tx) +} + +func (udng *UserDefinedNetworkGateway) updateAdvertisementStatus() { + vrfs := udng.GetPodNetworkAdvertisedOnNodeVRFs(udng.node.Name) + udng.isNetworkAdvertised = len(vrfs) > 0 + udng.isNetworkAdvertisedToDefaultVRF = slices.Contains(vrfs, types.DefaultNetworkName) +} diff --git a/go-controller/pkg/node/gateway_udn_test.go b/go-controller/pkg/node/gateway_udn_test.go index 34848faf7e..4611ea97ed 100644 --- a/go-controller/pkg/node/gateway_udn_test.go +++ b/go-controller/pkg/node/gateway_udn_test.go @@ -1143,7 +1143,7 @@ var _ = Describe("UserDefinedNetworkGateway", func() { Expect(udnGateway.AddNetwork()).To(Succeed()) flowMap = udnGateway.gateway.openflowManager.flowCache - Expect(flowMap["DEFAULT"]).To(HaveLen(69)) // 18 UDN Flows and 5 advertisedUDN flows are added by default + Expect(flowMap["DEFAULT"]).To(HaveLen(73)) // 18 UDN Flows, 5 advertisedUDN flows, and 2 packet mark flows (IPv4+IPv6) are added by default Expect(udnGateway.openflowManager.defaultBridge.GetNetConfigLen()).To(Equal(2)) // default network + UDN network defaultUdnConfig := udnGateway.openflowManager.defaultBridge.GetNetworkConfig("default") bridgeUdnConfig := udnGateway.openflowManager.defaultBridge.GetNetworkConfig("bluenet") @@ -1159,14 +1159,16 @@ var _ = Describe("UserDefinedNetworkGateway", func() { } } } - Expect(udnFlows).To(Equal(14)) + Expect(udnFlows).To(Equal(16)) openflowManagerCheckPorts(udnGateway.openflowManager) for _, svcCIDR := range config.Kubernetes.ServiceCIDRs { // Check flows for default network service CIDR. - // Expect exactly one flow per advertised UDN for table 2 and table 0 for service isolation. + // Expect exactly two flows per advertised UDN for table 2 and table 0 for service isolation, + // but one of the flows used by advertised UDNs is already tracked and used by the default network, hence it is not + // counted here but in the default svc isolation check above.
bridgeconfig.CheckAdvertisedUDNSvcIsolationOVSFlows(flowMap["DEFAULT"], bridgeUdnConfig, "bluenet", svcCIDR, 2) } @@ -1625,7 +1627,7 @@ func TestConstructUDNVRFIPRules(t *testing.T) { cidr := "" if config.IPv4Mode { cidr = "100.128.0.0/16/24" } if config.IPv4Mode && config.IPv6Mode { cidr += ",ae70::/60/64" } @@ -1655,7 +1656,7 @@ func TestConstructUDNVRFIPRules(t *testing.T) { }) g.Expect(err).NotTo(HaveOccurred()) udnGateway.vrfTableId = test.vrftableID - rules, delRules, err := udnGateway.constructUDNVRFIPRules(false) + rules, delRules, err := udnGateway.constructUDNVRFIPRules() g.Expect(err).ToNot(HaveOccurred()) for i, rule := range rules { g.Expect(rule.Priority).To(Equal(test.expectedRules[i].priority)) @@ -1677,7 +1678,7 @@ func TestConstructUDNVRFIPRules(t *testing.T) { } } -func TestConstructUDNVRFIPRulesPodNetworkAdvertised(t *testing.T) { +func TestConstructUDNVRFIPRulesPodNetworkAdvertisedToDefaultVRF(t *testing.T) { type testRule struct { priority int family int @@ -1705,6 +1706,12 @@ func TestConstructUDNVRFIPRulesPodNetworkAdvertised(t *testing.T) { table: 1007, mark: 0x1003, }, + { + priority: UDNMasqueradeIPRulePriority, + family: netlink.FAMILY_V4, + table: 1007, + dst: *util.GetIPNetFullMaskFromIP(ovntest.MustParseIP("169.254.0.16")), + }, { priority: UDNMasqueradeIPRulePriority, family: netlink.FAMILY_V4, @@ -1712,7 +1719,176 @@ func TestConstructUDNVRFIPRulesPodNetworkAdvertised(t *testing.T) { dst: *ovntest.MustParseIPNet("100.128.0.0/16"), }, }, - deleteRules: []testRule{ + v4mode: true, + }, + { + desc: "v6 rule test", + vrftableID: 1009, + expectedRules: []testRule{ + { + priority: UDNMasqueradeIPRulePriority, + family: netlink.FAMILY_V6, + table: 1009, + mark: 0x1003, + }, + { + priority: UDNMasqueradeIPRulePriority, + family: netlink.FAMILY_V6, + table: 1009, + dst: *util.GetIPNetFullMaskFromIP(ovntest.MustParseIP("fd69::10")), + }, + { + priority: UDNMasqueradeIPRulePriority, + family: netlink.FAMILY_V6, + table: 1009, + dst: *ovntest.MustParseIPNet("ae70::/60"), + }, + }, + v6mode: true, + }, + { + desc: "dualstack rule test", + vrftableID: 1010, + expectedRules: []testRule{ + { + priority: UDNMasqueradeIPRulePriority, + family: netlink.FAMILY_V4, + table: 1010, + mark: 0x1003, + }, + { + priority: UDNMasqueradeIPRulePriority, + family: netlink.FAMILY_V6, + table: 1010, + mark: 0x1003, + }, + { + priority: UDNMasqueradeIPRulePriority, + family: netlink.FAMILY_V4, + table: 1010, + dst: *util.GetIPNetFullMaskFromIP(ovntest.MustParseIP("169.254.0.16")), + }, + { + priority: UDNMasqueradeIPRulePriority, + family: netlink.FAMILY_V6, + table: 1010, + dst: *util.GetIPNetFullMaskFromIP(ovntest.MustParseIP("fd69::10")), + }, + { + priority: UDNMasqueradeIPRulePriority, + family: netlink.FAMILY_V4, + table: 1010, + dst: *ovntest.MustParseIPNet("100.128.0.0/16"), + }, + { + priority: UDNMasqueradeIPRulePriority, + family: netlink.FAMILY_V6, + table: 1010, + dst: *ovntest.MustParseIPNet("ae70::/60"), + }, + }, + v4mode: true, + v6mode: true, + }, + } + config.Gateway.V6MasqueradeSubnet = "fd69::/112" + config.Gateway.V4MasqueradeSubnet = "169.254.0.0/16" + for _, test := range tests { + t.Run(test.desc, func(t *testing.T) { + g := NewWithT(t) + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + }, + } + config.IPv4Mode = test.v4mode + config.IPv6Mode = test.v6mode + cidr := "" + if config.IPv4Mode { + cidr = "100.128.0.0/16/24" + } + if config.IPv4Mode && config.IPv6Mode { + cidr += ",ae70::/60/64" + } else if config.IPv6Mode { + cidr =
"ae70::/60/64" + } + nad := ovntest.GenerateNAD("bluenet", "rednad", "greenamespace", + types.Layer3Topology, cidr, types.NetworkRolePrimary) + ovntest.AnnotateNADWithNetworkID("3", nad) + netInfo, err := util.ParseNADInfo(nad) + g.Expect(err).ToNot(HaveOccurred()) + mutableNetInfo := util.NewMutableNetInfo(netInfo) + mutableNetInfo.SetPodNetworkAdvertisedVRFs(map[string][]string{node.Name: {"bluenet"}}) + ofm := getDummyOpenflowManager() + // create dummy gateway interface(Need to run this test as root) + err = netlink.LinkAdd(&netlink.Dummy{ + LinkAttrs: netlink.LinkAttrs{ + Name: "breth0", + }, + }) + g.Expect(err).NotTo(HaveOccurred()) + udnGateway, err := NewUserDefinedNetworkGateway(mutableNetInfo, node, nil, nil, nil, nil, &gateway{openflowManager: ofm}) + g.Expect(err).NotTo(HaveOccurred()) + // delete dummy gateway interface after creating UDN gateway(Need to run this test as root) + err = netlink.LinkDel(&netlink.Dummy{ + LinkAttrs: netlink.LinkAttrs{ + Name: "breth0", + }, + }) + g.Expect(err).NotTo(HaveOccurred()) + udnGateway.vrfTableId = test.vrftableID + udnGateway.isNetworkAdvertised = true + udnGateway.isNetworkAdvertisedToDefaultVRF = true + rules, delRules, err := udnGateway.constructUDNVRFIPRules() + g.Expect(err).ToNot(HaveOccurred()) + for i, rule := range rules { + g.Expect(rule.Priority).To(Equal(test.expectedRules[i].priority)) + g.Expect(rule.Table).To(Equal(test.expectedRules[i].table)) + g.Expect(rule.Family).To(Equal(test.expectedRules[i].family)) + if rule.Dst != nil { + g.Expect(*rule.Dst).To(Equal(test.expectedRules[i].dst)) + } else { + g.Expect(rule.Mark).To(Equal(test.expectedRules[i].mark)) + } + } + for i, rule := range delRules { + g.Expect(rule.Priority).To(Equal(test.deleteRules[i].priority)) + g.Expect(rule.Table).To(Equal(test.deleteRules[i].table)) + g.Expect(rule.Family).To(Equal(test.deleteRules[i].family)) + g.Expect(*rule.Dst).To(Equal(test.deleteRules[i].dst)) + } + }) + } +} + +func TestConstructUDNVRFIPRulesPodNetworkAdvertisedToNonDefaultVRF(t *testing.T) { + type testRule struct { + priority int + family int + table int + mark uint32 + dst net.IPNet + } + type testConfig struct { + desc string + vrftableID int + v4mode bool + v6mode bool + expectedRules []testRule + deleteRules []testRule + } + + tests := []testConfig{ + { + desc: "v4 rule test", + vrftableID: 1007, + expectedRules: []testRule{ + { + priority: UDNMasqueradeIPRulePriority, + family: netlink.FAMILY_V4, + table: 1007, + mark: 0x1003, + }, { priority: UDNMasqueradeIPRulePriority, family: netlink.FAMILY_V4, @@ -1720,6 +1896,14 @@ func TestConstructUDNVRFIPRulesPodNetworkAdvertised(t *testing.T) { dst: *util.GetIPNetFullMaskFromIP(ovntest.MustParseIP("169.254.0.16")), }, }, + deleteRules: []testRule{ + { + priority: UDNMasqueradeIPRulePriority, + family: netlink.FAMILY_V4, + table: 1007, + dst: *ovntest.MustParseIPNet("100.128.0.0/16"), + }, + }, v4mode: true, }, { @@ -1736,7 +1920,7 @@ func TestConstructUDNVRFIPRulesPodNetworkAdvertised(t *testing.T) { priority: UDNMasqueradeIPRulePriority, family: netlink.FAMILY_V6, table: 1009, - dst: *ovntest.MustParseIPNet("ae70::/60"), + dst: *util.GetIPNetFullMaskFromIP(ovntest.MustParseIP("fd69::10")), }, }, deleteRules: []testRule{ @@ -1744,7 +1928,7 @@ func TestConstructUDNVRFIPRulesPodNetworkAdvertised(t *testing.T) { priority: UDNMasqueradeIPRulePriority, family: netlink.FAMILY_V6, table: 1009, - dst: *util.GetIPNetFullMaskFromIP(ovntest.MustParseIP("fd69::10")), + dst: *ovntest.MustParseIPNet("ae70::/60"), }, }, v6mode: true, @@ 
-1769,13 +1953,13 @@ func TestConstructUDNVRFIPRulesPodNetworkAdvertised(t *testing.T) { priority: UDNMasqueradeIPRulePriority, family: netlink.FAMILY_V4, table: 1010, - dst: *ovntest.MustParseIPNet("100.128.0.0/16"), + dst: *util.GetIPNetFullMaskFromIP(ovntest.MustParseIP("169.254.0.16")), }, { priority: UDNMasqueradeIPRulePriority, family: netlink.FAMILY_V6, table: 1010, - dst: *ovntest.MustParseIPNet("ae70::/60"), + dst: *util.GetIPNetFullMaskFromIP(ovntest.MustParseIP("fd69::10")), }, }, deleteRules: []testRule{ @@ -1783,13 +1967,13 @@ func TestConstructUDNVRFIPRulesPodNetworkAdvertised(t *testing.T) { priority: UDNMasqueradeIPRulePriority, family: netlink.FAMILY_V4, table: 1010, - dst: *util.GetIPNetFullMaskFromIP(ovntest.MustParseIP("169.254.0.16")), + dst: *ovntest.MustParseIPNet("100.128.0.0/16"), }, { priority: UDNMasqueradeIPRulePriority, family: netlink.FAMILY_V6, table: 1010, - dst: *util.GetIPNetFullMaskFromIP(ovntest.MustParseIP("fd69::10")), + dst: *ovntest.MustParseIPNet("ae70::/60"), }, }, v4mode: true, @@ -1842,8 +2026,12 @@ func TestConstructUDNVRFIPRulesPodNetworkAdvertised(t *testing.T) { }) g.Expect(err).NotTo(HaveOccurred()) udnGateway.vrfTableId = test.vrftableID - rules, delRules, err := udnGateway.constructUDNVRFIPRules(true) + udnGateway.isNetworkAdvertised = true + udnGateway.isNetworkAdvertisedToDefaultVRF = false + rules, delRules, err := udnGateway.constructUDNVRFIPRules() g.Expect(err).ToNot(HaveOccurred()) + g.Expect(rules).To(HaveLen(len(test.expectedRules))) + g.Expect(delRules).To(HaveLen(len(test.deleteRules))) for i, rule := range rules { g.Expect(rule.Priority).To(Equal(test.expectedRules[i].priority)) g.Expect(rule.Table).To(Equal(test.expectedRules[i].table)) @@ -1973,7 +2161,8 @@ func TestUserDefinedNetworkGateway_updateAdvertisedUDNIsolationRules(t *testing. 
udng := &UserDefinedNetworkGateway{ NetInfo: netInfo, } - err = udng.updateAdvertisedUDNIsolationRules(tt.isNetworkAdvertised) + udng.isNetworkAdvertised = tt.isNetworkAdvertised + err = udng.updateAdvertisedUDNIsolationRules() g.Expect(err).NotTo(HaveOccurred()) v4Elems, err := nft.ListElements(context.TODO(), "set", nftablesAdvertisedUDNsSetV4) diff --git a/go-controller/pkg/node/node_nftables.go b/go-controller/pkg/node/node_nftables.go index e52a8970a4..ca4afc9ac2 100644 --- a/go-controller/pkg/node/node_nftables.go +++ b/go-controller/pkg/node/node_nftables.go @@ -13,8 +13,8 @@ import ( const nftPMTUDChain = "no-pmtud" -// setupPMTUDNFTSets sets up the NFT sets that contain remote Kubernetes node IPs -func setupPMTUDNFTSets() error { +// setupRemoteNodeNFTSets sets up the NFT sets that contain remote Kubernetes node IPs +func setupRemoteNodeNFTSets() error { nft, err := nodenft.GetNFTablesHelper() if err != nil { return fmt.Errorf("failed to get nftables helper: %w", err) @@ -22,12 +22,12 @@ func setupPMTUDNFTSets() error { tx := nft.NewTransaction() tx.Add(&knftables.Set{ - Name: types.NFTNoPMTUDRemoteNodeIPsv4, + Name: types.NFTRemoteNodeIPsv4, Comment: knftables.PtrTo("Block egress ICMP needs frag to remote Kubernetes nodes"), Type: "ipv4_addr", }) tx.Add(&knftables.Set{ - Name: types.NFTNoPMTUDRemoteNodeIPsv6, + Name: types.NFTRemoteNodeIPsv6, Comment: knftables.PtrTo("Block egress ICMPv6 packet too big to remote Kubernetes nodes"), Type: "ipv6_addr", }) @@ -68,7 +68,7 @@ func setupPMTUDNFTChain() error { tx.Add(&knftables.Rule{ Chain: nftPMTUDChain, Rule: knftables.Concat( - "ip daddr @"+types.NFTNoPMTUDRemoteNodeIPsv4, + "ip daddr @"+types.NFTRemoteNodeIPsv4, "meta l4proto icmp", "icmp type 3", // type 3 == Destination Unreachable "icmp code 4", // code 4 indicates fragmentation needed @@ -85,7 +85,7 @@ func setupPMTUDNFTChain() error { "meta l4proto icmpv6", // match on ICMPv6 packets "icmpv6 type 2", // type 2 == Packet Too Big (PMTUD) "icmpv6 code 0", // code 0 for that message - "ip6 daddr @"+types.NFTNoPMTUDRemoteNodeIPsv6, + "ip6 daddr @"+types.NFTRemoteNodeIPsv6, counterIfDebug, "drop", // drop the packet ), diff --git a/go-controller/pkg/node/obj_retry_node.go b/go-controller/pkg/node/obj_retry_node.go index 9c9657678e..646cca2ac3 100644 --- a/go-controller/pkg/node/obj_retry_node.go +++ b/go-controller/pkg/node/obj_retry_node.go @@ -238,34 +238,43 @@ func (h *nodeEventHandler) UpdateResource(oldObj, newObj interface{}, _ bool) er return nil } - // remote node that is changing - ipsToKeep := map[string]bool{} - for _, address := range newNode.Status.Addresses { - if address.Type != corev1.NodeInternalIP { - continue + if util.NodeHostCIDRsAnnotationChanged(oldNode, newNode) { + // remote node that is changing + // Use GetNodeAddresses to get new node IPs + newIPsv4, newIPsv6, err := util.GetNodeAddresses(config.IPv4Mode, config.IPv6Mode, newNode) + if err != nil { + return fmt.Errorf("failed to get addresses for new node %q: %w", newNode.Name, err) } - nodeIP := net.ParseIP(address.Address) - if nodeIP == nil { - continue + + ipsToKeep := map[string]bool{} + for _, nodeIP := range newIPsv4 { + ipsToKeep[nodeIP.String()] = true } - ipsToKeep[nodeIP.String()] = true - } - ipsToRemove := make([]net.IP, 0) - for _, address := range oldNode.Status.Addresses { - if address.Type != corev1.NodeInternalIP { - continue + for _, nodeIP := range newIPsv6 { + ipsToKeep[nodeIP.String()] = true } - nodeIP := net.ParseIP(address.Address) - if nodeIP == nil { - continue + + // Use 
GetNodeAddresses to get old node IPs + oldIPsv4, oldIPsv6, err := util.GetNodeAddresses(config.IPv4Mode, config.IPv6Mode, oldNode) + if err != nil { + return fmt.Errorf("failed to get addresses for old node %q: %w", oldNode.Name, err) } - if _, exists := ipsToKeep[nodeIP.String()]; !exists { - ipsToRemove = append(ipsToRemove, nodeIP) + + ipsToRemove := make([]net.IP, 0) + for _, nodeIP := range oldIPsv4 { + if _, exists := ipsToKeep[nodeIP.String()]; !exists { + ipsToRemove = append(ipsToRemove, nodeIP) + } + } + for _, nodeIP := range oldIPsv6 { + if _, exists := ipsToKeep[nodeIP.String()]; !exists { + ipsToRemove = append(ipsToRemove, nodeIP) + } } - } - if err := removePMTUDNodeNFTRules(ipsToRemove); err != nil { - return fmt.Errorf("error removing node %q stale NFT rules during update: %w", oldNode.Name, err) + if err := removePMTUDNodeNFTRules(ipsToRemove); err != nil { + return fmt.Errorf("error removing node %q stale NFT rules during update: %w", oldNode.Name, err) + } } return h.nc.addOrUpdateNode(newNode) diff --git a/go-controller/pkg/ovn/address_set/fake_address_set.go b/go-controller/pkg/ovn/address_set/fake_address_set.go index 2f783b3486..22db3c4a0f 100644 --- a/go-controller/pkg/ovn/address_set/fake_address_set.go +++ b/go-controller/pkg/ovn/address_set/fake_address_set.go @@ -11,6 +11,7 @@ import ( "k8s.io/klog/v2" utilnet "k8s.io/utils/net" + "github.com/ovn-kubernetes/libovsdb/client" "github.com/ovn-kubernetes/libovsdb/ovsdb" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" @@ -124,7 +125,7 @@ func (f *FakeAddressSetFactory) GetAddressSet(dbIDs *libovsdbops.DbObjectIDs) (A if ok { return set, nil } - return nil, fmt.Errorf("error fetching address set") + return nil, fmt.Errorf("error fetching address set: %w", client.ErrNotFound) } func (f *FakeAddressSetFactory) ProcessEachAddressSet(ownerController string, indexT *libovsdbops.ObjectIDsType, iteratorFn AddressSetIterFunc) error { diff --git a/go-controller/pkg/ovn/base_network_controller_secondary.go b/go-controller/pkg/ovn/base_network_controller_secondary.go index f9c6d0b18f..7ba474901e 100644 --- a/go-controller/pkg/ovn/base_network_controller_secondary.go +++ b/go-controller/pkg/ovn/base_network_controller_secondary.go @@ -28,6 +28,8 @@ import ( libovsdbops "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdb/ops" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/metrics" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb" + addressset "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/address_set" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/controller/udnenabledsvc" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/persistentips" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" @@ -812,7 +814,7 @@ func (oc *BaseSecondaryNetworkController) allowPersistentIPs() bool { // buildUDNEgressSNAT is used to build the conditional SNAT required on L3 and L2 UDNs to // steer traffic correctly via mp0 when leaving OVN to the host -func (bsnc *BaseSecondaryNetworkController) buildUDNEgressSNAT(localPodSubnets []*net.IPNet, outputPort string) ([]*nbdb.NAT, error) { +func (bsnc *BaseSecondaryNetworkController) buildUDNEgressSNAT(localPodSubnets []*net.IPNet, outputPort string, isUDNAdvertised bool) ([]*nbdb.NAT, error) { if len(localPodSubnets) == 0 { return nil, nil // nothing to do } @@ -822,16 +824,46 @@ func (bsnc *BaseSecondaryNetworkController) buildUDNEgressSNAT(localPodSubnets [ networkID := 
bsnc.GetNetworkID() // calculate MAC dstMac := util.IPAddrToHWAddr(util.GetNodeManagementIfAddr(localPodSubnets[0]).IP) + dstMacMatch := getMasqueradeManagementIPSNATMatch(dstMac.String()) extIDs := map[string]string{ types.NetworkExternalID: bsnc.GetNetworkName(), types.TopologyExternalID: bsnc.TopologyType(), } + + var nodeIPsAS, svcIPsAS addressset.AddressSet + if isUDNAdvertised { + // For advertised networks, we need to SNAT any traffic leaving the + // pods from these networks towards the node IPs in the cluster. In + // order to do such a conditional SNAT, we need an address set that + // contains the node IPs in the cluster. Given that egressIP feature + // already has an address set containing these nodeIPs owned by the + // default network controller, let's re-use it. + nodeIPsASIDs := getEgressIPAddrSetDbIDs(NodeIPAddrSetName, types.DefaultNetworkName, DefaultNetworkControllerName) + nodeIPsAS, err = bsnc.addressSetFactory.GetAddressSet(nodeIPsASIDs) + if err != nil { + return nil, fmt.Errorf("failed to get address set with IDs %v: %w", nodeIPsASIDs, err) + } + + // We also need to SNAT any traffic leaving the pods from these + // networks towards the default network service cluster IPs + // accessible from UDNs: we want the reply traffic to hit the + // masquerade IP rule rather than the UDN subnet ip rule to allow + // for overlaps in VRF-Lite configurations + svcIPsASIDs := udnenabledsvc.GetAddressSetDBIDs() + svcIPsAS, err = bsnc.addressSetFactory.GetAddressSet(svcIPsASIDs) + if err != nil { + return nil, fmt.Errorf("failed to get address set with IDs %v: %w", svcIPsASIDs, err) + } + } + for _, localPodSubnet := range localPodSubnets { + snatMatch := dstMacMatch + ipFamily := utilnet.IPv4 + masqIP, err = udn.AllocateV4MasqueradeIPs(networkID) if utilnet.IsIPv6CIDR(localPodSubnet) { masqIP, err = udn.AllocateV6MasqueradeIPs(networkID) - } else { - masqIP, err = udn.AllocateV4MasqueradeIPs(networkID) + ipFamily = utilnet.IPv6 } if err != nil { return nil, err @@ -839,12 +871,55 @@ func (bsnc *BaseSecondaryNetworkController) buildUDNEgressSNAT(localPodSubnets [ if masqIP == nil { return nil, fmt.Errorf("masquerade IP cannot be empty network %s (%d): %v", bsnc.GetNetworkName(), networkID, err) } - snats = append(snats, libovsdbops.BuildSNATWithMatch(&masqIP.ManagementPort.IP, localPodSubnet, outputPort, - extIDs, getMasqueradeManagementIPSNATMatch(dstMac.String()))) + + if isUDNAdvertised { + additionalSNATMatch := getClusterNodesDestinationBasedSNATMatch(ipFamily, nodeIPsAS, svcIPsAS) + if additionalSNATMatch != "" { + snatMatch = fmt.Sprintf("%s && %s", snatMatch, additionalSNATMatch) + } + } + + snat := libovsdbops.BuildSNATWithMatch( + &masqIP.ManagementPort.IP, + localPodSubnet, + outputPort, + extIDs, + snatMatch, + ) + snats = append(snats, snat) } + return snats, nil } +func getMasqueradeManagementIPSNATMatch(dstMac string) string { + return fmt.Sprintf("eth.dst == %s", dstMac) +} + +// getClusterNodesDestinationBasedSNATMatch creates destination-based SNAT match +// for the specified IP family. Returns an empty string if there is no address +// set for the provided IP family. 
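Before the implementation itself, a small runnable sketch of the combination step this helper performs; the address-set hash names here are invented for illustration, and in buildUDNEgressSNAT above the result is AND-ed with the eth.dst management-port match:

```go
package main

import (
	"fmt"
	"strings"
)

// combineASMatches mirrors how getClusterNodesDestinationBasedSNATMatch joins
// one family-scoped match per address set: empty when no set matches the
// family, bare for a single set, OR-ed and parenthesized for several.
func combineASMatches(asMatches ...string) string {
	switch len(asMatches) {
	case 0:
		return ""
	case 1:
		return asMatches[0]
	default:
		return fmt.Sprintf("(%s)", strings.Join(asMatches, " || "))
	}
}

func main() {
	nodeIPs := "ip4.dst == $aNodeIPsV4Hash"          // hypothetical hash name
	svcVIPs := "ip4.dst == $aUDNEnabledServicesHash" // hypothetical hash name
	fmt.Printf("%q\n", combineASMatches())
	fmt.Printf("%q\n", combineASMatches(nodeIPs))
	fmt.Printf("%q\n", combineASMatches(nodeIPs, svcVIPs))
}
```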
+func getClusterNodesDestinationBasedSNATMatch(ipFamily utilnet.IPFamily, addressSets ...addressset.AddressSet) string { + asMatches := make([]string, 0, len(addressSets)) + for _, as := range addressSets { + asIPv4, asIPv6 := as.GetASHashNames() + switch { + case ipFamily == utilnet.IPv4 && asIPv4 != "": + asMatches = append(asMatches, fmt.Sprintf("ip4.dst == $%s", asIPv4)) + case ipFamily == utilnet.IPv6 && asIPv6 != "": + asMatches = append(asMatches, fmt.Sprintf("ip6.dst == $%s", asIPv6)) + } + } + switch len(asMatches) { + case 0: + return "" + case 1: + return asMatches[0] + default: + return fmt.Sprintf("(%s)", strings.Join(asMatches, " || ")) + } +} + func (bsnc *BaseSecondaryNetworkController) ensureDHCP(pod *corev1.Pod, podAnnotation *util.PodAnnotation, lsp *nbdb.LogicalSwitchPort) error { opts := []kubevirt.DHCPConfigsOpt{} @@ -867,10 +942,6 @@ func (bsnc *BaseSecondaryNetworkController) ensureDHCP(pod *corev1.Pod, podAnnot return kubevirt.EnsureDHCPOptionsForLSP(bsnc.controllerName, bsnc.nbClient, pod, podAnnotation.IPs, lsp, opts...) } -func getMasqueradeManagementIPSNATMatch(dstMac string) string { - return fmt.Sprintf("eth.dst == %s", dstMac) -} - func (bsnc *BaseSecondaryNetworkController) requireDHCP(pod *corev1.Pod) bool { // Configure DHCP only for kubevirt VMs layer2 primary udn with subnets return kubevirt.IsPodOwnedByVirtualMachine(pod) && diff --git a/go-controller/pkg/ovn/egressgw.go b/go-controller/pkg/ovn/egressgw.go index b607a3b253..d9d8610aba 100644 --- a/go-controller/pkg/ovn/egressgw.go +++ b/go-controller/pkg/ovn/egressgw.go @@ -589,7 +589,7 @@ func (oc *DefaultNetworkController) deletePodSNAT(nodeName string, extIPs, podIP return nil } // Default network does not set any matches in Pod SNAT - ops, err := deletePodSNATOps(oc.nbClient, nil, oc.GetNetworkScopedGWRouterName(nodeName), extIPs, podIPNets, "") + ops, err := deletePodSNATOps(oc.nbClient, nil, oc.GetNetworkScopedGWRouterName(nodeName), extIPs, podIPNets) if err != nil { return err } @@ -639,8 +639,8 @@ func getExternalIPsGR(watchFactory *factory.WatchFactory, nodeName string) ([]*n // deletePodSNATOps creates ovsdb operation that removes per pod SNAT rules towards the nodeIP that are applied to the GR where the pod resides // used when disableSNATMultipleGWs=true -func deletePodSNATOps(nbClient libovsdbclient.Client, ops []ovsdb.Operation, gwRouterName string, extIPs, podIPNets []*net.IPNet, match string) ([]ovsdb.Operation, error) { - nats, err := buildPodSNAT(extIPs, podIPNets, match) +func deletePodSNATOps(nbClient libovsdbclient.Client, ops []ovsdb.Operation, gwRouterName string, extIPs, podIPNets []*net.IPNet) ([]ovsdb.Operation, error) { + nats, err := buildPodSNAT(extIPs, podIPNets, "") // for delete, match is not needed - we try to cleanup all the SNATs that match the isEquivalentNAT predicate if err != nil { return nil, err } @@ -657,7 +657,7 @@ func deletePodSNATOps(nbClient libovsdbclient.Client, ops []ovsdb.Operation, gwR // addOrUpdatePodSNAT adds or updates per pod SNAT rules towards the nodeIP that are applied to the GR where the pod resides // used when disableSNATMultipleGWs=true func addOrUpdatePodSNAT(nbClient libovsdbclient.Client, gwRouterName string, extIPs, podIfAddrs []*net.IPNet) error { - ops, err := addOrUpdatePodSNATOps(nbClient, gwRouterName, extIPs, podIfAddrs, nil) + ops, err := addOrUpdatePodSNATOps(nbClient, gwRouterName, extIPs, podIfAddrs, "", nil) if err != nil { return err } @@ -670,9 +670,9 @@ func addOrUpdatePodSNAT(nbClient libovsdbclient.Client, gwRouterName 
string, ext // addOrUpdatePodSNATOps returns the operation that adds or updates per pod SNAT rules towards the nodeIP that are // applied to the GR where the pod resides // used when disableSNATMultipleGWs=true -func addOrUpdatePodSNATOps(nbClient libovsdbclient.Client, gwRouterName string, extIPs, podIfAddrs []*net.IPNet, ops []ovsdb.Operation) ([]ovsdb.Operation, error) { +func addOrUpdatePodSNATOps(nbClient libovsdbclient.Client, gwRouterName string, extIPs, podIfAddrs []*net.IPNet, snatMatch string, ops []ovsdb.Operation) ([]ovsdb.Operation, error) { gwRouter := &nbdb.LogicalRouter{Name: gwRouterName} - nats, err := buildPodSNAT(extIPs, podIfAddrs, "") + nats, err := buildPodSNAT(extIPs, podIfAddrs, snatMatch) if err != nil { return nil, err } diff --git a/go-controller/pkg/ovn/egressip.go b/go-controller/pkg/ovn/egressip.go index 5f50cefb95..ed018c0de3 100644 --- a/go-controller/pkg/ovn/egressip.go +++ b/go-controller/pkg/ovn/egressip.go @@ -249,7 +249,7 @@ func NewEIPController( // CASE 3.4: Both Namespace && Pod Selectors on Spec changed // } // -// NOTE: `Spec.EgressIPs“ updates for EIP object are not processed here, that is the job of cluster manager +// NOTE: `Spec.EgressIPs` updates for the EIP object are not processed here; that is the job of the cluster manager // // We only care about `Spec.NamespaceSelector`, `Spec.PodSelector` and `Status` field func (e *EgressIPController) reconcileEgressIP(old, new *egressipv1.EgressIP) (err error) { @@ -2594,9 +2594,21 @@ func (e *EgressIPController) addExternalGWPodSNATOps(ni util.NetInfo, ops []ovsd if err != nil { return nil, err } - ops, err = addOrUpdatePodSNATOps(e.nbClient, ni.GetNetworkScopedGWRouterName(pod.Spec.NodeName), extIPs, podIPs, ops) - if err != nil { - return nil, err + + // Handle each pod IP individually since each IP family needs its own SNAT match + for _, podIP := range podIPs { + ipFamily := utilnet.IPv4 + if utilnet.IsIPv6CIDR(podIP) { + ipFamily = utilnet.IPv6 + } + snatMatch, err := GetNetworkScopedClusterSubnetSNATMatch(e.nbClient, ni, pod.Spec.NodeName, util.IsPodNetworkAdvertisedAtNode(ni, pod.Spec.NodeName), ipFamily) + if err != nil { + return nil, fmt.Errorf("failed to get SNAT match for node %s for network %s: %w", pod.Spec.NodeName, ni.GetNetworkName(), err) + } + ops, err = addOrUpdatePodSNATOps(e.nbClient, ni.GetNetworkScopedGWRouterName(pod.Spec.NodeName), extIPs, []*net.IPNet{podIP}, snatMatch, ops) + if err != nil { + return nil, err + } } klog.V(5).Infof("Adding SNAT on %s since egress node managing %s/%s was the same: %s", pod.Spec.NodeName, pod.Namespace, pod.Name, status.Node) } @@ -2617,7 +2629,7 @@ func (e *EgressIPController) deleteExternalGWPodSNATOps(ni util.NetInfo, ops []o if err != nil { return nil, err } - ops, err = deletePodSNATOps(e.nbClient, ops, ni.GetNetworkScopedGWRouterName(pod.Spec.NodeName), extIPs, affectedIPs, "") + ops, err = deletePodSNATOps(e.nbClient, ops, ni.GetNetworkScopedGWRouterName(pod.Spec.NodeName), extIPs, affectedIPs) if err != nil { return nil, err } diff --git a/go-controller/pkg/ovn/gateway.go b/go-controller/pkg/ovn/gateway.go index a43adf5368..a200fea0b9 100644 --- a/go-controller/pkg/ovn/gateway.go +++ b/go-controller/pkg/ovn/gateway.go @@ -25,6 +25,7 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/metrics" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node" + addressset "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/address_set"
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/gateway" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/gatewayrouter" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" @@ -42,7 +43,6 @@ type GatewayManager struct { nbClient libovsdbclient.Client netInfo util.NetInfo watchFactory *factory.WatchFactory - // Cluster wide Load_Balancer_Group UUID. // Includes all node switches and node gateway routers. clusterLoadBalancerGroupUUID string @@ -143,8 +143,8 @@ func WithLoadBalancerGroups(routerLBGroup, clusterLBGroup, switchLBGroup string) } // cleanupStalePodSNATs removes pod SNATs against nodeIP for the given node if -// the SNAT.logicalIP isn't an active podIP, the pod network is being advertised -// on this node or disableSNATMultipleGWs=false. We don't have to worry about +// the SNAT.logicalIP isn't an active podIP, or disableSNATMultipleGWs=false. +// We don't have to worry about // missing SNATs that should be added because addLogicalPort takes care of this // for all pods when RequestRetryObjs is called for each node add. // Other non-pod SNATs like join subnet SNATs are ignored. @@ -154,11 +154,11 @@ func WithLoadBalancerGroups(routerLBGroup, clusterLBGroup, switchLBGroup string) // pod->nodeSNATs which won't get cleared up unless explicitly deleted. // NOTE2: egressIP SNATs are synced in EIP controller. func (gw *GatewayManager) cleanupStalePodSNATs(nodeName string, nodeIPs []*net.IPNet, gwLRPIPs []net.IP) error { - // collect all the pod IPs for which we should be doing the SNAT; if the pod - // network is advertised or DisableSNATMultipleGWs==false we consider all + // collect all the pod IPs for which we should be doing the SNAT; + // if DisableSNATMultipleGWs==false we consider all // the SNATs stale podIPsWithSNAT := sets.New[string]() - if !gw.isRoutingAdvertised(nodeName) && config.Gateway.DisableSNATMultipleGWs { + if config.Gateway.DisableSNATMultipleGWs { pods, err := gw.watchFactory.GetAllPods() if err != nil { return fmt.Errorf("unable to list existing pods on node: %s, %w", @@ -231,7 +231,6 @@ func (gw *GatewayManager) cleanupStalePodSNATs(nodeName string, nodeIPs []*net.I } natsToDelete = append(natsToDelete, routerNat) } - if len(natsToDelete) > 0 { err := libovsdbops.DeleteNATs(gw.nbClient, gatewayRouter, natsToDelete...) if err != nil { @@ -764,7 +763,9 @@ func (gw *GatewayManager) updateGWRouterNAT(nodeName string, clusterIPSubnet []* nats := make([]*nbdb.NAT, 0, len(clusterIPSubnet)) var nat *nbdb.NAT - if (!config.Gateway.DisableSNATMultipleGWs || gw.netInfo.IsPrimaryNetwork()) && !gw.isRoutingAdvertised(nodeName) { + // DisableSNATMultipleGWs is only applicable to cluster default network and not to user defined networks. + // For user defined networks, we always add SNAT rules regardless of whether the network is advertised or not. + if !config.Gateway.DisableSNATMultipleGWs || gw.netInfo.IsPrimaryNetwork() { // Default SNAT rules. DisableSNATMultipleGWs=false in LGW (traffic egresses via mp0) always. // We are not checking for gateway mode to be shared explicitly to reduce topology differences. 
for _, entry := range clusterIPSubnet { @@ -774,7 +775,17 @@ func (gw *GatewayManager) updateGWRouterNAT(nodeName string, clusterIPSubnet []* gw.gwRouterName, err) } - nat = libovsdbops.BuildSNATWithMatch(&externalIP[0], entry, "", extIDs, gw.netInfo.GetNetworkScopedClusterSubnetSNATMatch(nodeName)) + // Get the match for this specific subnet's IP family + ipFamily := utilnet.IPv4 + if utilnet.IsIPv6CIDR(entry) { + ipFamily = utilnet.IPv6 + } + snatMatch, err := GetNetworkScopedClusterSubnetSNATMatch(gw.nbClient, gw.netInfo, nodeName, gw.isRoutingAdvertised(nodeName), ipFamily) + if err != nil { + return fmt.Errorf("failed to get SNAT match for node %s for network %s: %w", nodeName, gw.netInfo.GetNetworkName(), err) + } + + nat = libovsdbops.BuildSNATWithMatch(&externalIP[0], entry, "", extIDs, snatMatch) nats = append(nats, nat) } err = libovsdbops.CreateOrUpdateNATs(gw.nbClient, gwRouter, nats...) @@ -784,7 +795,7 @@ func (gw *GatewayManager) updateGWRouterNAT(nodeName string, clusterIPSubnet []* } else { // ensure we do not have any leftover SNAT entries after an upgrade for _, logicalSubnet := range clusterIPSubnet { - nat = libovsdbops.BuildSNATWithMatch(nil, logicalSubnet, "", extIDs, gw.netInfo.GetNetworkScopedClusterSubnetSNATMatch(nodeName)) + nat = libovsdbops.BuildSNAT(nil, logicalSubnet, "", extIDs) nats = append(nats, nat) } err = libovsdbops.DeleteNATs(gw.nbClient, gwRouter, nats...) @@ -902,6 +913,39 @@ func (gw *GatewayManager) gatewayInit( return nil } +// GetNetworkScopedClusterSubnetSNATMatch returns the conditional match for the cluster subnet SNAT rule, +// both for the cluster default network and for L3/L2 user defined networks. +// If the network is not advertised: +// - For Layer2 topology, the match is the output port from the GR to the external switch, since in L2 there is only 1 router but two cSNATs. +// - For Layer3 topology, the match is empty. +// If the network is advertised: +// - For Layer2 topology, the match is the output port from the GR to the external switch combined with the requirement that the destination is a nodeIP in the cluster. +// - For Layer3 topology, the match is that the destination must be a nodeIP in the cluster.
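A worked illustration of those four cases, assuming a hypothetical L2 network "bluenet" on node "ovn-worker"; the port name is invented for the sketch, while the address-set hash reuses the value that appears in the tests later in this diff:

```go
package main

import "fmt"

// snatMatchShape sketches the four cases documented above (names illustrative).
func snatMatchShape(isLayer2, isAdvertised bool) string {
	const outport = `outport == "rtoe-GR_bluenet_ovn-worker"` // GR -> external switch port
	const nodeIPsDst = "ip4.dst == $a712973235162149816"      // egress node-IPs address set
	switch {
	case !isAdvertised && !isLayer2:
		return "" // L3, not advertised: unconditional cluster-subnet SNAT
	case !isAdvertised && isLayer2:
		return outport
	case isAdvertised && !isLayer2:
		return nodeIPsDst
	default: // advertised Layer2
		return outport + " && " + nodeIPsDst
	}
}

func main() {
	for _, l2 := range []bool{false, true} {
		for _, adv := range []bool{false, true} {
			fmt.Printf("layer2=%-5v advertised=%-5v match=%q\n", l2, adv, snatMatchShape(l2, adv))
		}
	}
}
```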
+func GetNetworkScopedClusterSubnetSNATMatch(nbClient libovsdbclient.Client, netInfo util.NetInfo, nodeName string, isNetworkAdvertised bool, ipFamily utilnet.IPFamily) (string, error) { + if !isNetworkAdvertised { + if netInfo.TopologyType() != types.Layer2Topology { + return "", nil + } + return fmt.Sprintf("outport == %q", types.GWRouterToExtSwitchPrefix+netInfo.GetNetworkScopedGWRouterName(nodeName)), nil + } + + // if the network is advertised, we need to ensure that the SNAT exists with the correct conditional destination match + dbIDs := getEgressIPAddrSetDbIDs(NodeIPAddrSetName, types.DefaultNetworkName, DefaultNetworkControllerName) + addressSetFactory := addressset.NewOvnAddressSetFactory(nbClient, config.IPv4Mode, config.IPv6Mode) + addrSet, err := addressSetFactory.GetAddressSet(dbIDs) + if err != nil { + return "", fmt.Errorf("cannot ensure that addressSet %v exists: %w", dbIDs, err) + } + destinationMatch := getClusterNodesDestinationBasedSNATMatch(ipFamily, addrSet) + if destinationMatch == "" { + return "", fmt.Errorf("could not build a destination based SNAT match because no addressSet %v exists for IP family %v", dbIDs, ipFamily) + } + if netInfo.TopologyType() != types.Layer2Topology { + return destinationMatch, nil + } + return fmt.Sprintf("outport == %q && %s", types.GWRouterToExtSwitchPrefix+netInfo.GetNetworkScopedGWRouterName(nodeName), destinationMatch), nil +} + // addExternalSwitch creates a switch connected to the external bridge and connects it to // the gateway router func (gw *GatewayManager) addExternalSwitch(prefix, interfaceID, gatewayRouter, macAddress, physNetworkName string, ipAddresses []*net.IPNet, vlanID *uint) error { diff --git a/go-controller/pkg/ovn/gateway_test.go b/go-controller/pkg/ovn/gateway_test.go index 61f89e831d..893d17ad09 100644 --- a/go-controller/pkg/ovn/gateway_test.go +++ b/go-controller/pkg/ovn/gateway_test.go @@ -65,6 +65,15 @@ func generateGatewayInitExpectedNB(testData []libovsdbtest.TestData, expectedOVN expectedNodeSwitch *nbdb.LogicalSwitch, nodeName string, clusterIPSubnets []*net.IPNet, hostSubnets []*net.IPNet, l3GatewayConfig *util.L3GatewayConfig, joinLRPIPs, defLRPIPs []*net.IPNet, skipSnat bool, nodeMgmtPortIP, gatewayMTU string) []libovsdbtest.TestData { + return generateGatewayInitExpectedNBWithPodNetworkAdvertised(testData, expectedOVNClusterRouter, expectedNodeSwitch, + nodeName, clusterIPSubnets, hostSubnets, l3GatewayConfig, joinLRPIPs, defLRPIPs, skipSnat, nodeMgmtPortIP, + gatewayMTU, false) // Default to no pod network advertised +} + +func generateGatewayInitExpectedNBWithPodNetworkAdvertised(testData []libovsdbtest.TestData, expectedOVNClusterRouter *nbdb.LogicalRouter, + expectedNodeSwitch *nbdb.LogicalSwitch, nodeName string, clusterIPSubnets []*net.IPNet, hostSubnets []*net.IPNet, + l3GatewayConfig *util.L3GatewayConfig, joinLRPIPs, defLRPIPs []*net.IPNet, skipSnat bool, nodeMgmtPortIP, + gatewayMTU string, isPodNetworkAdvertised bool) []libovsdbtest.TestData { GRName := "GR_" + nodeName gwSwitchPort := types.JoinSwitchToGWRouterPrefix + GRName @@ -214,6 +223,16 @@ func generateGatewayInitExpectedNB(testData []libovsdbtest.TestData, expectedOVN }, Networks: networks, }) + var egressNodeIPsASv4, egressNodeIPsASv6 *nbdb.AddressSet + if config.OVNKubernetesFeature.EnableEgressIP { + egressNodeIPsASv4, egressNodeIPsASv6 = buildEgressIPNodeAddressSets(physicalIPs) + if config.IPv4Mode { + testData = append(testData, egressNodeIPsASv4) + } + if config.IPv6Mode { + testData = append(testData, egressNodeIPsASv6) 
+ } + } natUUIDs := make([]string, 0, len(clusterIPSubnets)) if !skipSnat { @@ -231,6 +250,19 @@ func generateGatewayInitExpectedNB(testData []libovsdbtest.TestData, expectedOVN if config.Gateway.Mode != config.GatewayModeDisabled { nat.ExternalPortRange = config.DefaultEphemeralPortRange } + if isPodNetworkAdvertised { + // IPv6 pod network + if utilnet.IsIPv6CIDR(subnet) { + if egressNodeIPsASv6 != nil { + nat.Match = fmt.Sprintf("ip6.dst == $%s", egressNodeIPsASv6.Name) + } + } else { + // IPv4 pod network + if egressNodeIPsASv4 != nil { + nat.Match = fmt.Sprintf("ip4.dst == $%s", egressNodeIPsASv4.Name) + } + } + } testData = append(testData, &nat) } } diff --git a/go-controller/pkg/ovn/master_test.go b/go-controller/pkg/ovn/master_test.go index bc943ecf8b..cb1dffb847 100644 --- a/go-controller/pkg/ovn/master_test.go +++ b/go-controller/pkg/ovn/master_test.go @@ -963,6 +963,7 @@ var _ = ginkgo.Describe("Default network controller operations", func() { // Restore global default values before each testcase gomega.Expect(config.PrepareTestConfig()).To(gomega.Succeed()) fakeOvn = NewFakeOVN(true) + config.OVNKubernetesFeature.EnableEgressIP = true app = cli.NewApp() app.Name = "test" @@ -1043,6 +1044,19 @@ var _ = ginkgo.Describe("Default network controller operations", func() { l3GatewayConfig = node1.gatewayConfig(config.GatewayModeLocal, uint(vlanID)) err = util.SetL3GatewayConfig(nodeAnnotator, l3GatewayConfig) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if config.OVNKubernetesFeature.EnableEgressIP { + physicalIPs := []string{} + for _, ip := range l3GatewayConfig.IPAddresses { + physicalIPs = append(physicalIPs, ip.IP.String()) + } + egressNodeIPsASv4, egressNodeIPsASv6 := buildEgressIPNodeAddressSets(physicalIPs) + if config.IPv4Mode { + dbSetup.NBData = append(dbSetup.NBData, egressNodeIPsASv4) + } + if config.IPv6Mode { + dbSetup.NBData = append(dbSetup.NBData, egressNodeIPsASv6) + } + } err = util.UpdateNodeManagementPortMACAddresses(&testNode, nodeAnnotator, ovntest.MustParseMAC(node1.NodeMgmtPortMAC), types.DefaultNetworkName) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -1245,10 +1259,18 @@ var _ = ginkgo.Describe("Default network controller operations", func() { newNodeSNAT("stale-nodeNAT-UUID-3", "10.0.0.3", Node1GatewayRouterIP), newNodeSNAT("stale-nodeNAT-UUID-4", "10.0.0.3", "172.16.16.3"), } + extraNatsWithMatch := []*nbdb.NAT{ // used for pod network advertised test + newNodeSNATWithMatch("stale-nodeNAT-UUID-1", "10.1.0.3", Node1GatewayRouterIP, "ip4.dst == $a712973235162149816"), + newNodeSNATWithMatch("stale-nodeNAT-UUID-2", "10.2.0.3", Node1GatewayRouterIP, "ip4.dst == $a712973235162149816"), + newNodeSNATWithMatch("stale-nodeNAT-UUID-3", "10.0.0.3", Node1GatewayRouterIP, "ip4.dst == $a712973235162149816"), + newNodeSNATWithMatch("stale-nodeNAT-UUID-4", "10.0.0.3", "172.16.16.3", "ip4.dst == $a712973235162149816"), + } ginkgo.DescribeTable( "reconciles pod network SNATs from syncGateway", func(condition func(*DefaultNetworkController) error, expectedExtraNATs ...*nbdb.NAT) { + config.OVNKubernetesFeature.AdvertisedUDNIsolationMode = config.AdvertisedUDNIsolationModeStrict app.Action = func(ctx *cli.Context) error { + // Initialize config from CLI flags (including --init-gateways) _, err := config.InitConfig(ctx, nil, nil) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -1258,6 +1280,10 @@ var _ = ginkgo.Describe("Default network controller operations", func() { _, err = fakeClient.KubeClient.CoreV1().Pods(ns.Name).Create(context.TODO(), &pod, 
metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + // generate specific test conditions (after base config is set) + err = condition(oc) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + // Let the real code run and ensure OVN database sync gomega.Expect(oc.WatchNodes()).To(gomega.Succeed()) @@ -1265,11 +1291,11 @@ var _ = ginkgo.Describe("Default network controller operations", func() { GR := &nbdb.LogicalRouter{ Name: types.GWRouterPrefix + node1.Name, } - err = libovsdbops.CreateOrUpdateNATs(nbClient, GR, extraNats...) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - // generate specific test conditions - err = condition(oc) + if !oc.isPodNetworkAdvertisedAtNode(node1.Name) { + err = libovsdbops.CreateOrUpdateNATs(nbClient, GR, extraNats...) + } else { + err = libovsdbops.CreateOrUpdateNATs(nbClient, GR, extraNatsWithMatch...) + } gomega.Expect(err).NotTo(gomega.HaveOccurred()) // ensure the stale SNAT's are cleaned up @@ -1281,19 +1307,23 @@ var _ = ginkgo.Describe("Default network controller operations", func() { err = oc.syncNodeGateway(testNode) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - skipSnat := config.Gateway.DisableSNATMultipleGWs || oc.isPodNetworkAdvertisedAtNode(node1.Name) + skipSnat := config.Gateway.DisableSNATMultipleGWs && !oc.GetNetInfo().IsPrimaryNetwork() var clusterSubnets []*net.IPNet for _, clusterSubnet := range config.Default.ClusterSubnets { clusterSubnets = append(clusterSubnets, clusterSubnet.CIDR) } expectedNBDatabaseState = addNodeLogicalFlowsWithServiceController(nil, expectedOVNClusterRouter, expectedNodeSwitch, expectedClusterRouterPortGroup, expectedClusterPortGroup, &node1, oc.svcTemplateSupport) - expectedNBDatabaseState = generateGatewayInitExpectedNB(expectedNBDatabaseState, expectedOVNClusterRouter, - expectedNodeSwitch, node1.Name, clusterSubnets, []*net.IPNet{subnet}, l3GatewayConfig, - []*net.IPNet{classBIPAddress(node1.LrpIP)}, []*net.IPNet{classBIPAddress(node1.DrLrpIP)}, - skipSnat, node1.NodeMgmtPortIP, "1400") - - if oc.isPodNetworkAdvertisedAtNode(node1.Name) { + if !oc.isPodNetworkAdvertisedAtNode(node1.Name) { + expectedNBDatabaseState = generateGatewayInitExpectedNB(expectedNBDatabaseState, expectedOVNClusterRouter, + expectedNodeSwitch, node1.Name, clusterSubnets, []*net.IPNet{subnet}, l3GatewayConfig, + []*net.IPNet{classBIPAddress(node1.LrpIP)}, []*net.IPNet{classBIPAddress(node1.DrLrpIP)}, + skipSnat, node1.NodeMgmtPortIP, "1400") + } else { + expectedNBDatabaseState = generateGatewayInitExpectedNBWithPodNetworkAdvertised(expectedNBDatabaseState, expectedOVNClusterRouter, + expectedNodeSwitch, node1.Name, clusterSubnets, []*net.IPNet{subnet}, l3GatewayConfig, + []*net.IPNet{classBIPAddress(node1.LrpIP)}, []*net.IPNet{classBIPAddress(node1.DrLrpIP)}, + skipSnat, node1.NodeMgmtPortIP, "1400", true) addrSet, err := oc.addressSetFactory.GetAddressSet(GetAdvertisedNetworkSubnetsAddressSetDBIDs()) gomega.Expect(err).NotTo(gomega.HaveOccurred()) expectedNBDatabaseState = generateAdvertisedUDNIsolationExpectedNB(expectedNBDatabaseState, oc.GetNetworkName(), oc.GetNetworkID(), clusterSubnets, expectedNodeSwitch, addrSet) @@ -1347,17 +1377,21 @@ var _ = ginkgo.Describe("Default network controller operations", func() { mutableNetInfo.SetPodNetworkAdvertisedVRFs(map[string][]string{"node1": {"vrf"}}) return oc.Reconcile(mutableNetInfo) }, - newNodeSNAT("stale-nodeNAT-UUID-4", "10.0.0.3", "172.16.16.3"), // won't be deleted on this node but will be deleted on the node whose IP is 172.16.16.3 since this pod 
belongs to this node + // won't be deleted on this node since this pod belongs to node-1 and is advertised so we keep this SNAT + newNodeSNATWithMatch("stale-nodeNAT-UUID-3", "10.0.0.3", Node1GatewayRouterIP, "ip4.dst == $a712973235162149816"), + // won't be deleted on this node but will be deleted on the node whose IP is 172.16.16.3 since this pod belongs to node-1 + newNodeSNATWithMatch("stale-nodeNAT-UUID-4", "10.0.0.3", "172.16.16.3", "ip4.dst == $a712973235162149816"), ), ginkgo.Entry( "When pod network is advertised and DisableSNATMultipleGWs is false", func(oc *DefaultNetworkController) error { config.Gateway.DisableSNATMultipleGWs = false + config.OVNKubernetesFeature.EnableEgressIP = true mutableNetInfo := util.NewMutableNetInfo(oc.GetNetInfo()) mutableNetInfo.SetPodNetworkAdvertisedVRFs(map[string][]string{"node1": {"vrf"}}) return oc.Reconcile(mutableNetInfo) }, - newNodeSNAT("stale-nodeNAT-UUID-4", "10.0.0.3", "172.16.16.3"), // won't be deleted on this node but will be deleted on the node whose IP is 172.16.16.3 since this pod belongs to this node + newNodeSNATWithMatch("stale-nodeNAT-UUID-4", "10.0.0.3", "172.16.16.3", "ip4.dst == $a712973235162149816"), // won't be deleted on this node but will be deleted on the node whose IP is 172.16.16.3 since this pod belongs to this node ), ) @@ -1962,6 +1996,12 @@ func newNodeSNAT(uuid, logicalIP, externalIP string) *nbdb.NAT { } } +func newNodeSNATWithMatch(uuid, logicalIP, externalIP, match string) *nbdb.NAT { + nat := newNodeSNAT(uuid, logicalIP, externalIP) + nat.Match = match + return nat +} + func TestController_syncNodes(t *testing.T) { gomega.RegisterFailHandler(ginkgo.Fail) diff --git a/go-controller/pkg/ovn/namespace.go b/go-controller/pkg/ovn/namespace.go index 01f189228b..07282de4df 100644 --- a/go-controller/pkg/ovn/namespace.go +++ b/go-controller/pkg/ovn/namespace.go @@ -8,10 +8,12 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/klog/v2" + utilnet "k8s.io/utils/net" "github.com/ovn-kubernetes/libovsdb/ovsdb" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" + libovsdbops "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdb/ops" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" utilerrors "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util/errors" @@ -234,9 +236,41 @@ func (oc *DefaultNetworkController) updateNamespace(old, newer *corev1.Namespace if err != nil { errors = append(errors, err) } else { - if extIPs, err := getExternalIPsGR(oc.watchFactory, pod.Spec.NodeName); err != nil { - errors = append(errors, err) - } else if err = addOrUpdatePodSNAT(oc.nbClient, oc.GetNetworkScopedGWRouterName(pod.Spec.NodeName), extIPs, podAnnotation.IPs); err != nil { + // Helper function to handle the complex SNAT operations + handleSNATOps := func() error { + extIPs, err := getExternalIPsGR(oc.watchFactory, pod.Spec.NodeName) + if err != nil { + return err + } + + var ops []ovsdb.Operation + // Handle each pod IP individually since each IP family needs its own SNAT match + for _, podIP := range podAnnotation.IPs { + ipFamily := utilnet.IPv4 + if utilnet.IsIPv6CIDR(podIP) { + ipFamily = utilnet.IPv6 + } + snatMatch, err := GetNetworkScopedClusterSubnetSNATMatch(oc.nbClient, oc.GetNetInfo(), pod.Spec.NodeName, oc.isPodNetworkAdvertisedAtNode(pod.Spec.NodeName), ipFamily) + if err != nil { + return fmt.Errorf("failed to get SNAT match for node %s for network %s: %v", pod.Spec.NodeName, oc.GetNetworkName(), 
err) + } + ops, err = addOrUpdatePodSNATOps(oc.nbClient, oc.GetNetworkScopedGWRouterName(pod.Spec.NodeName), extIPs, []*net.IPNet{podIP}, snatMatch, ops) + if err != nil { + return err + } + } + + // Execute all operations in a single transaction + if len(ops) > 0 { + _, err = libovsdbops.TransactAndCheck(oc.nbClient, ops) + if err != nil { + return fmt.Errorf("failed to update SNAT for pod %s on router %s: %v", pod.Name, oc.GetNetworkScopedGWRouterName(pod.Spec.NodeName), err) + } + } + return nil + } + + if err := handleSNATOps(); err != nil { errors = append(errors, err) } } diff --git a/go-controller/pkg/ovn/ovn.go b/go-controller/pkg/ovn/ovn.go index 07b7b6a83b..e75be5588e 100644 --- a/go-controller/pkg/ovn/ovn.go +++ b/go-controller/pkg/ovn/ovn.go @@ -399,7 +399,8 @@ func (oc *DefaultNetworkController) syncNodeGateway(node *corev1.Node) error { return fmt.Errorf("error creating gateway for node %s: %v", node.Name, err) } - if util.IsPodNetworkAdvertisedAtNode(oc, node.Name) { + if util.IsPodNetworkAdvertisedAtNode(oc, node.Name) && + config.OVNKubernetesFeature.AdvertisedUDNIsolationMode == config.AdvertisedUDNIsolationModeStrict { return oc.addAdvertisedNetworkIsolation(node.Name) } return oc.deleteAdvertisedNetworkIsolation(node.Name) diff --git a/go-controller/pkg/ovn/pods.go b/go-controller/pkg/ovn/pods.go index 5c3478f3cb..0ad9442e3e 100644 --- a/go-controller/pkg/ovn/pods.go +++ b/go-controller/pkg/ovn/pods.go @@ -12,6 +12,7 @@ import ( corev1 "k8s.io/api/core/v1" ktypes "k8s.io/apimachinery/pkg/types" "k8s.io/klog/v2" + utilnet "k8s.io/utils/net" "github.com/ovn-kubernetes/libovsdb/ovsdb" @@ -310,13 +311,26 @@ func (oc *DefaultNetworkController) addLogicalPort(pod *corev1.Pod) (err error) if err != nil { return err } - } else if config.Gateway.DisableSNATMultipleGWs && !oc.isPodNetworkAdvertisedAtNode(pod.Spec.NodeName) { + } else if config.Gateway.DisableSNATMultipleGWs { // Add NAT rules to pods if disable SNAT is set and does not have // namespace annotations to go through external egress router if extIPs, err := getExternalIPsGR(oc.watchFactory, pod.Spec.NodeName); err != nil { return err - } else if ops, err = addOrUpdatePodSNATOps(oc.nbClient, oc.GetNetworkScopedGWRouterName(pod.Spec.NodeName), extIPs, podAnnotation.IPs, ops); err != nil { - return err + } else { + // Handle each pod IP individually since each IP family needs its own SNAT match + for _, podIP := range podAnnotation.IPs { + ipFamily := utilnet.IPv4 + if utilnet.IsIPv6CIDR(podIP) { + ipFamily = utilnet.IPv6 + } + snatMatch, err := GetNetworkScopedClusterSubnetSNATMatch(oc.nbClient, oc.GetNetInfo(), pod.Spec.NodeName, oc.isPodNetworkAdvertisedAtNode(pod.Spec.NodeName), ipFamily) + if err != nil { + return fmt.Errorf("failed to get SNAT match for node %s for network %s: %v", pod.Spec.NodeName, oc.GetNetworkName(), err) + } + if ops, err = addOrUpdatePodSNATOps(oc.nbClient, oc.GetNetworkScopedGWRouterName(pod.Spec.NodeName), extIPs, []*net.IPNet{podIP}, snatMatch, ops); err != nil { + return err + } + } } } diff --git a/go-controller/pkg/ovn/secondary_layer2_network_controller.go b/go-controller/pkg/ovn/secondary_layer2_network_controller.go index 7ce63fc278..b9cd878648 100644 --- a/go-controller/pkg/ovn/secondary_layer2_network_controller.go +++ b/go-controller/pkg/ovn/secondary_layer2_network_controller.go @@ -575,36 +575,39 @@ func (oc *SecondaryLayer2NetworkController) addUpdateLocalNodeEvent(node *corev1 gwManager := oc.gatewayManagerForNode(node.Name) oc.gatewayManagers.Store(node.Name, gwManager) - 
gwConfig, err := oc.nodeGatewayConfig(node) - if err != nil { - errs = append(errs, err) - oc.gatewaysFailed.Store(node.Name, true) - } else { + err := func() error { + gwConfig, err := oc.nodeGatewayConfig(node) + if err != nil { + return err + } if err := gwManager.SyncGateway( node, gwConfig, ); err != nil { - errs = append(errs, err) - oc.gatewaysFailed.Store(node.Name, true) - } else { - if !util.IsPodNetworkAdvertisedAtNode(oc, node.Name) { - err = oc.addUDNClusterSubnetEgressSNAT(gwConfig.hostSubnets, gwManager.gwRouterName) - if err == nil && util.IsRouteAdvertisementsEnabled() { - err = oc.deleteAdvertisedNetworkIsolation(node.Name) - } - } else { - err = oc.deleteUDNClusterSubnetEgressSNAT(gwConfig.hostSubnets, gwManager.gwRouterName) - if err == nil { - err = oc.addAdvertisedNetworkIsolation(node.Name) - } + return err + } + isUDNAdvertised := util.IsPodNetworkAdvertisedAtNode(oc, node.Name) + err = oc.addOrUpdateUDNClusterSubnetEgressSNAT(gwConfig.hostSubnets, gwManager.gwRouterName, isUDNAdvertised) + if err != nil { + return err + } + shouldIsolate := isUDNAdvertised && config.OVNKubernetesFeature.AdvertisedUDNIsolationMode == config.AdvertisedUDNIsolationModeStrict + if shouldIsolate { + if err = oc.addAdvertisedNetworkIsolation(node.Name); err != nil { + return err } - if err != nil { - errs = append(errs, err) - oc.gatewaysFailed.Store(node.Name, true) - } else { - oc.gatewaysFailed.Delete(node.Name) + } else { + if err = oc.deleteAdvertisedNetworkIsolation(node.Name); err != nil { + return err } } + oc.gatewaysFailed.Delete(node.Name) + return nil + }() + + if err != nil { + errs = append(errs, err) + oc.gatewaysFailed.Store(node.Name, true) } } @@ -741,7 +744,8 @@ func (oc *SecondaryLayer2NetworkController) deleteNodeEvent(node *corev1.Node) e return nil } -// addUDNClusterSubnetEgressSNAT adds the SNAT on each node's GR in L2 networks +// addOrUpdateUDNClusterSubnetEgressSNAT adds or updates the SNAT on each node's GR in L2 networks for each UDN. +// Based on the isUDNAdvertised flag, the SNAT matches are slightly different. // snat eth.dst == d6:cf:fd:2c:a6:44 169.254.0.12 10.128.0.0/14 // snat eth.dst == d6:cf:fd:2c:a6:44 169.254.0.12 2010:100:200::/64 // these SNATs are required for pod2Egress traffic in LGW mode and pod2SameNode traffic in SGW mode to function properly on UDNs @@ -751,9 +755,12 @@ func (oc *SecondaryLayer2NetworkController) deleteNodeEvent(node *corev1.Node) e // externalIP = "169.254.0.12"; which is the masqueradeIP for this L2 UDN // so all in all we want to conditionally SNAT all packets that are coming from pods hosted on this node, // which are leaving via UDN's mpX interface to the UDN's masqueradeIP. +// If isUDNAdvertised is true, then we also want to SNAT packets coming from pods on this network +// towards nodeIPs in the cluster to the masqueradeIP.
If the network is advertised, the SNAT looks like this: +// "eth.dst == 0a:58:5d:5d:00:02 && (ip4.dst == $a712973235162149816)" "169.254.0.36" "93.93.0.0/16" +func (oc *SecondaryLayer2NetworkController) addOrUpdateUDNClusterSubnetEgressSNAT(localPodSubnets []*net.IPNet, gwRouterName string, isUDNAdvertised bool) error { outputPort := types.GWRouterToJoinSwitchPrefix + gwRouterName - nats, err := oc.buildUDNEgressSNAT(localPodSubnets, outputPort) + nats, err := oc.buildUDNEgressSNAT(localPodSubnets, outputPort, isUDNAdvertised) if err != nil { return err } @@ -770,25 +777,6 @@ func (oc *SecondaryLayer2NetworkController) deleteNodeEvent(node *corev1.Node) e return nil } -func (oc *SecondaryLayer2NetworkController) deleteUDNClusterSubnetEgressSNAT(localPodSubnets []*net.IPNet, routerName string) error { - outputPort := types.GWRouterToJoinSwitchPrefix + routerName - nats, err := oc.buildUDNEgressSNAT(localPodSubnets, outputPort) - if err != nil { - return err - } - if len(nats) == 0 { - return nil // nothing to do - } - router := &nbdb.LogicalRouter{ - Name: routerName, - } - if err := libovsdbops.DeleteNATs(oc.nbClient, router, nats...); err != nil { - return fmt.Errorf("failed to delete SNAT for cluster on router: %q for network %q, error: %w", - routerName, oc.GetNetworkName(), err) - } - return nil -} - func (oc *SecondaryLayer2NetworkController) nodeGatewayConfig(node *corev1.Node) (*GatewayConfig, error) { l3GatewayConfig, err := util.ParseNodeL3GatewayAnnotation(node) if err != nil { diff --git a/go-controller/pkg/ovn/secondary_layer3_network_controller.go b/go-controller/pkg/ovn/secondary_layer3_network_controller.go index b2355b9100..11cdf4f53f 100644 --- a/go-controller/pkg/ovn/secondary_layer3_network_controller.go +++ b/go-controller/pkg/ovn/secondary_layer3_network_controller.go @@ -857,7 +857,8 @@ func (oc *SecondaryLayer3NetworkController) addUpdateRemoteNodeEvent(node *corev return err } -// addNodeSubnetEgressSNAT adds the SNAT on each node's ovn-cluster-router in L3 networks +// addOrUpdateUDNNodeSubnetEgressSNAT adds or updates the SNAT on each node's ovn-cluster-router in L3 networks for each UDN. +// Based on the isUDNAdvertised flag, the SNAT matches are slightly different. // snat eth.dst == d6:cf:fd:2c:a6:44 169.254.0.12 10.128.0.0/24 // snat eth.dst == d6:cf:fd:2c:a6:44 169.254.0.12 2010:100:200::/64 // these SNATs are required for pod2Egress traffic in LGW mode and pod2SameNode traffic in SGW mode to function properly on UDNs @@ -867,9 +868,12 @@ func (oc *SecondaryLayer3NetworkController) addUpdateRemoteNodeEvent(node *corev // externalIP = "169.254.0.12"; which is the masqueradeIP for this L3 UDN // so all in all we want to conditionally SNAT all packets that are coming from pods hosted on this node, // which are leaving via UDN's mpX interface to the UDN's masqueradeIP. +// If isUDNAdvertised is true, then we also want to SNAT packets coming from pods on this network +// towards nodeIPs in the cluster to the masqueradeIP.
If the network is advertised, the SNAT looks like this: +// "eth.dst == 0a:58:5d:5d:00:02 && (ip4.dst == $a712973235162149816)" "169.254.0.36" "93.93.0.0/24" +func (oc *SecondaryLayer3NetworkController) addOrUpdateUDNNodeSubnetEgressSNAT(localPodSubnets []*net.IPNet, node *corev1.Node, isUDNAdvertised bool) error { outputPort := types.RouterToSwitchPrefix + oc.GetNetworkScopedName(node.Name) - nats, err := oc.buildUDNEgressSNAT(localPodSubnets, outputPort) + nats, err := oc.buildUDNEgressSNAT(localPodSubnets, outputPort, isUDNAdvertised) if err != nil { return fmt.Errorf("failed to build UDN masquerade SNATs for network %q on node %q, err: %w", oc.GetNetworkName(), node.Name, @@ -887,28 +891,6 @@ func (oc *SecondaryLayer3NetworkController) addUpdateRemoteNodeEvent(node *corev return nil } -// deleteUDNNodeSubnetEgressSNAT deletes SNAT rule from network specific -// ovn_cluster_router depending on whether the network is advertised or not -func (oc *SecondaryLayer3NetworkController) deleteUDNNodeSubnetEgressSNAT(localPodSubnets []*net.IPNet, node *corev1.Node) error { - outputPort := types.RouterToSwitchPrefix + oc.GetNetworkScopedName(node.Name) - nats, err := oc.buildUDNEgressSNAT(localPodSubnets, outputPort) - if err != nil { - return fmt.Errorf("failed to build UDN masquerade SNATs for network %q on node %q, err: %w", - oc.GetNetworkName(), node.Name, err) - } - if len(nats) == 0 { - return nil // nothing to do - } - router := &nbdb.LogicalRouter{ - Name: oc.GetNetworkScopedClusterRouterName(), - } - if err := libovsdbops.DeleteNATs(oc.nbClient, router, nats...); err != nil { - return fmt.Errorf("failed to delete SNAT for node subnet on router: %q for network %q, error: %w", - oc.GetNetworkScopedClusterRouterName(), oc.GetNetworkName(), err) - } - return nil -} - func (oc *SecondaryLayer3NetworkController) addNode(node *corev1.Node) ([]*net.IPNet, error) { // Node subnet for the secondary layer3 network is allocated by cluster manager.
// Make sure that the node has its subnet allocated before proceeding @@ -923,20 +905,17 @@ func (oc *SecondaryLayer3NetworkController) addNode(node *corev1.Node) ([]*net.I return nil, err } if util.IsNetworkSegmentationSupportEnabled() && oc.IsPrimaryNetwork() { - if !util.IsPodNetworkAdvertisedAtNode(oc, node.Name) { - if err := oc.addUDNNodeSubnetEgressSNAT(hostSubnets, node); err != nil { + isUDNAdvertised := util.IsPodNetworkAdvertisedAtNode(oc, node.Name) + if err := oc.addOrUpdateUDNNodeSubnetEgressSNAT(hostSubnets, node, isUDNAdvertised); err != nil { + return nil, err + } + shouldIsolate := isUDNAdvertised && config.OVNKubernetesFeature.AdvertisedUDNIsolationMode == config.AdvertisedUDNIsolationModeStrict + if shouldIsolate { + if err = oc.addAdvertisedNetworkIsolation(node.Name); err != nil { return nil, err } - if util.IsRouteAdvertisementsEnabled() { - if err := oc.deleteAdvertisedNetworkIsolation(node.Name); err != nil { - return nil, err - } - } } else { - if err := oc.deleteUDNNodeSubnetEgressSNAT(hostSubnets, node); err != nil { - return nil, err - } - if err := oc.addAdvertisedNetworkIsolation(node.Name); err != nil { + if err = oc.deleteAdvertisedNetworkIsolation(node.Name); err != nil { return nil, err } } diff --git a/go-controller/pkg/ovn/udn_isolation.go b/go-controller/pkg/ovn/udn_isolation.go index 0230f665b6..3403b0a9a7 100644 --- a/go-controller/pkg/ovn/udn_isolation.go +++ b/go-controller/pkg/ovn/udn_isolation.go @@ -371,38 +371,45 @@ func (bnc *BaseNetworkController) addAdvertisedNetworkIsolation(nodeName string) // It removes the network CIDRs from the global advertised networks address set together with the ACLs on the node switch. func (bnc *BaseNetworkController) deleteAdvertisedNetworkIsolation(nodeName string) error { addrSet, err := bnc.addressSetFactory.GetAddressSet(GetAdvertisedNetworkSubnetsAddressSetDBIDs()) - if err != nil { + if err != nil && !errors.Is(err, libovsdbclient.ErrNotFound) { return fmt.Errorf("failed to get advertised subnets address set %s for network %s: %w", GetAdvertisedNetworkSubnetsAddressSetDBIDs(), bnc.GetNetworkName(), err) } - var cidrs []string - for _, subnet := range bnc.Subnets() { - cidrs = append(cidrs, subnet.CIDR.String()) - } - ops, err := addrSet.DeleteAddressesReturnOps(cidrs) - if err != nil { - return fmt.Errorf("failed to create ovsdb ops for deleting the addresses from %s addresset for network %s: %w", GetAdvertisedNetworkSubnetsAddressSetDBIDs(), bnc.GetNetworkName(), err) + var ops []ovsdb.Operation + if addrSet != nil { + var cidrs []string + for _, subnet := range bnc.Subnets() { + cidrs = append(cidrs, subnet.CIDR.String()) + } + ops, err = addrSet.DeleteAddressesReturnOps(cidrs) + if err != nil { + return fmt.Errorf("failed to create ovsdb ops for deleting the addresses from %s address set for network %s: %w", GetAdvertisedNetworkSubnetsAddressSetDBIDs(), bnc.GetNetworkName(), err) + } } passACLIDs := GetAdvertisedNetworkSubnetsPassACLdbIDs(bnc.controllerName, bnc.GetNetworkName(), bnc.GetNetworkID()) + dropACLIDs := GetAdvertisedNetworkSubnetsDropACLdbIDs() passACLPredicate := libovsdbops.GetPredicate[*nbdb.ACL](passACLIDs, nil) - passACLs, err := libovsdbops.FindACLsWithPredicate(bnc.nbClient, passACLPredicate) - if err != nil { - return fmt.Errorf("unable to find the pass ACL for advertised network %s: %w", bnc.GetNetworkName(), err) + dropACLPredicate := libovsdbops.GetPredicate[*nbdb.ACL](dropACLIDs, nil) + // Create a combined predicate to find both ACLs in a single lookup + combinedACLPredicate =
func(acl *nbdb.ACL) bool { + // Check if ACL matches either pass or drop ACL IDs + return passACLPredicate(acl) || dropACLPredicate(acl) } - dropACLIDs := GetAdvertisedNetworkSubnetsDropACLdbIDs() - dropACLPredicate := libovsdbops.GetPredicate[*nbdb.ACL](dropACLIDs, nil) - dropACLs, err := libovsdbops.FindACLsWithPredicate(bnc.nbClient, dropACLPredicate) + // Find both ACLs in a single lookup + allACLsToRemove, err := libovsdbops.FindACLsWithPredicate(bnc.nbClient, combinedACLPredicate) if err != nil { - return fmt.Errorf("unable to find the drop ACL for advertised network %s: %w", bnc.GetNetworkName(), err) + return fmt.Errorf("unable to find pass and/or drop ACLs for advertised network %s: %w", bnc.GetNetworkName(), err) } // ACLs referenced by the switch will be deleted by the db if there are no other references p := func(sw *nbdb.LogicalSwitch) bool { return sw.Name == bnc.GetNetworkScopedSwitchName(nodeName) } - ops, err = libovsdbops.RemoveACLsFromLogicalSwitchesWithPredicateOps(bnc.nbClient, ops, p, append(passACLs, dropACLs...)...) - if err != nil { - return fmt.Errorf("failed to create ovsdb ops for removing network isolation ACLs from the %s switch for network %s: %w", bnc.GetNetworkScopedSwitchName(nodeName), bnc.GetNetworkName(), err) + if len(allACLsToRemove) > 0 { + ops, err = libovsdbops.RemoveACLsFromLogicalSwitchesWithPredicateOps(bnc.nbClient, ops, p, allACLsToRemove...) + if err != nil { + return fmt.Errorf("failed to create ovsdb ops for removing network isolation ACLs from the %s switch for network %s: %w", bnc.GetNetworkScopedSwitchName(nodeName), bnc.GetNetworkName(), err) + } } _, err = libovsdbops.TransactAndCheck(bnc.nbClient, ops) diff --git a/go-controller/pkg/types/const.go b/go-controller/pkg/types/const.go index 8ba7269cad..523da8e27b 100644 --- a/go-controller/pkg/types/const.go +++ b/go-controller/pkg/types/const.go @@ -312,13 +312,13 @@ const ( // CUDNPrefix of all CUDN network names CUDNPrefix = "cluster_udn_" - // NFTNoPMTUDRemoteNodeIPsv4 is a set used to track remote node IPs that do not belong to + // NFTRemoteNodeIPsv4 is a set used to track remote node IPv4 addresses that do not belong to // the local node's subnet. - NFTNoPMTUDRemoteNodeIPsv4 = "no-pmtud-remote-node-ips-v4" + NFTRemoteNodeIPsv4 = "remote-node-ips-v4" - // NFTNoPMTUDRemoteNodeIPsv6 is a set used to track remote node IPs that do not belong to + // NFTRemoteNodeIPsv6 is a set used to track remote node IPv6 addresses that do not belong to // the local node's subnet.
- NFTNoPMTUDRemoteNodeIPsv6 = "no-pmtud-remote-node-ips-v6" + NFTRemoteNodeIPsv6 = "remote-node-ips-v6" // Metrics MetricOvnkubeNamespace = "ovnkube" diff --git a/go-controller/pkg/util/multi_network.go b/go-controller/pkg/util/multi_network.go index fd91edd3be..841ab001b8 100644 --- a/go-controller/pkg/util/multi_network.go +++ b/go-controller/pkg/util/multi_network.go @@ -82,7 +82,6 @@ type NetInfo interface { GetNetworkScopedExtPortName(bridgeID, nodeName string) string GetNetworkScopedLoadBalancerName(lbName string) string GetNetworkScopedLoadBalancerGroupName(lbGroupName string) string - GetNetworkScopedClusterSubnetSNATMatch(nodeName string) string // GetNetInfo is an identity method used to get the specific NetInfo // implementation @@ -543,10 +542,6 @@ func (nInfo *DefaultNetInfo) GetNetworkScopedLoadBalancerGroupName(lbGroupName s return nInfo.GetNetworkScopedName(lbGroupName) } -func (nInfo *DefaultNetInfo) GetNetworkScopedClusterSubnetSNATMatch(_ string) string { - return "" -} - func (nInfo *DefaultNetInfo) canReconcile(netInfo NetInfo) bool { _, ok := netInfo.(*DefaultNetInfo) return ok @@ -738,13 +733,6 @@ func (nInfo *secondaryNetInfo) GetNetworkScopedLoadBalancerGroupName(lbGroupName return nInfo.GetNetworkScopedName(lbGroupName) } -func (nInfo *secondaryNetInfo) GetNetworkScopedClusterSubnetSNATMatch(nodeName string) string { - if nInfo.TopologyType() != types.Layer2Topology { - return "" - } - return fmt.Sprintf("outport == %q", types.GWRouterToExtSwitchPrefix+nInfo.GetNetworkScopedGWRouterName(nodeName)) -} - // getPrefix returns the logical entities prefix for this network func (nInfo *secondaryNetInfo) getPrefix() string { return GetSecondaryNetworkPrefix(nInfo.netName) diff --git a/helm/ovn-kubernetes/charts/ovnkube-control-plane/templates/ovnkube-control-plane.yaml b/helm/ovn-kubernetes/charts/ovnkube-control-plane/templates/ovnkube-control-plane.yaml index 465a0aa665..bf8a5eb67a 100644 --- a/helm/ovn-kubernetes/charts/ovnkube-control-plane/templates/ovnkube-control-plane.yaml +++ b/helm/ovn-kubernetes/charts/ovnkube-control-plane/templates/ovnkube-control-plane.yaml @@ -128,6 +128,8 @@ spec: value: {{ default "" .Values.global.enableNetworkSegmentation | quote }} - name: OVN_PRE_CONF_UDN_ADDR_ENABLE value: {{ default "" .Values.global.enablePreconfiguredUDNAddresses | quote }} + - name: OVN_ADVERTISED_UDN_ISOLATION_MODE + value: {{ default "strict" .Values.global.advertisedUDNIsolationMode | quote }} - name: OVN_HYBRID_OVERLAY_NET_CIDR value: {{ default "" .Values.global.hybridOverlayNetCidr | quote }} - name: OVN_DISABLE_SNAT_MULTIPLE_GWS diff --git a/helm/ovn-kubernetes/charts/ovnkube-master/templates/deployment-ovnkube-master.yaml b/helm/ovn-kubernetes/charts/ovnkube-master/templates/deployment-ovnkube-master.yaml index 5e8a48a47a..832b1bf913 100644 --- a/helm/ovn-kubernetes/charts/ovnkube-master/templates/deployment-ovnkube-master.yaml +++ b/helm/ovn-kubernetes/charts/ovnkube-master/templates/deployment-ovnkube-master.yaml @@ -236,6 +236,8 @@ spec: value: {{ hasKey .Values.global "enableMultiNetwork" | ternary .Values.global.enableMultiNetwork false | quote }} - name: OVN_NETWORK_SEGMENTATION_ENABLE value: {{ default "" .Values.global.enableNetworkSegmentation | quote }} + - name: OVN_ADVERTISED_UDN_ISOLATION_MODE + value: {{ default "strict" .Values.global.advertisedUDNIsolationMode | quote }} - name: OVN_EGRESSSERVICE_ENABLE value: {{ default "" .Values.global.enableEgressService | quote }} - name: OVN_HYBRID_OVERLAY_NET_CIDR diff --git
a/helm/ovn-kubernetes/charts/ovnkube-node/templates/ovnkube-node.yaml b/helm/ovn-kubernetes/charts/ovnkube-node/templates/ovnkube-node.yaml index dbf6268f6a..4aaaa950b8 100644 --- a/helm/ovn-kubernetes/charts/ovnkube-node/templates/ovnkube-node.yaml +++ b/helm/ovn-kubernetes/charts/ovnkube-node/templates/ovnkube-node.yaml @@ -231,6 +231,8 @@ spec: value: {{ default "" .Values.global.enableNetworkSegmentation | quote }} - name: OVN_PRE_CONF_UDN_ADDR_ENABLE value: {{ default "" .Values.global.enablePreconfiguredUDNAddresses | quote }} + - name: OVN_ADVERTISED_UDN_ISOLATION_MODE + value: {{ default "strict" .Values.global.advertisedUDNIsolationMode | quote }} - name: OVN_ENABLE_INTERCONNECT value: {{ hasKey .Values.global "enableInterconnect" | ternary .Values.global.enableInterconnect false | quote }} - name: OVN_ENABLE_MULTI_EXTERNAL_GATEWAY diff --git a/helm/ovn-kubernetes/charts/ovnkube-single-node-zone/templates/ovnkube-single-node-zone.yaml b/helm/ovn-kubernetes/charts/ovnkube-single-node-zone/templates/ovnkube-single-node-zone.yaml index 2cd3913633..e6848b1773 100644 --- a/helm/ovn-kubernetes/charts/ovnkube-single-node-zone/templates/ovnkube-single-node-zone.yaml +++ b/helm/ovn-kubernetes/charts/ovnkube-single-node-zone/templates/ovnkube-single-node-zone.yaml @@ -416,6 +416,8 @@ spec: value: {{ default "" .Values.global.enableNetworkSegmentation | quote }} - name: OVN_PRE_CONF_UDN_ADDR_ENABLE value: {{ default "" .Values.global.enablePreconfiguredUDNAddresses | quote }} + - name: OVN_ADVERTISED_UDN_ISOLATION_MODE + value: {{ default "strict" .Values.global.advertisedUDNIsolationMode | quote }} - name: OVNKUBE_NODE_MGMT_PORT_NETDEV value: {{ default "" .Values.global.nodeMgmtPortNetdev | quote }} - name: OVN_EMPTY_LB_EVENTS diff --git a/helm/ovn-kubernetes/charts/ovnkube-zone-controller/templates/ovnkube-zone-controller.yaml b/helm/ovn-kubernetes/charts/ovnkube-zone-controller/templates/ovnkube-zone-controller.yaml index 3a437db089..4fb914439e 100644 --- a/helm/ovn-kubernetes/charts/ovnkube-zone-controller/templates/ovnkube-zone-controller.yaml +++ b/helm/ovn-kubernetes/charts/ovnkube-zone-controller/templates/ovnkube-zone-controller.yaml @@ -315,6 +315,8 @@ spec: value: {{ default "" .Values.global.enableNetworkSegmentation | quote }} - name: OVN_PRE_CONF_UDN_ADDR_ENABLE value: {{ default "" .Values.global.enablePreconfiguredUDNAddresses | quote }} + - name: OVN_ADVERTISED_UDN_ISOLATION_MODE + value: {{ default "strict" .Values.global.advertisedUDNIsolationMode | quote }} - name: OVN_HYBRID_OVERLAY_NET_CIDR value: {{ default "" .Values.global.hybridOverlayNetCidr | quote }} - name: OVN_DISABLE_SNAT_MULTIPLE_GWS diff --git a/test/e2e/e2e.go b/test/e2e/e2e.go index e5bbde7d42..c837d10ded 100644 --- a/test/e2e/e2e.go +++ b/test/e2e/e2e.go @@ -2028,7 +2028,7 @@ var _ = ginkgo.Describe("e2e br-int flow monitoring export validation", func() { }) -func getNodePodCIDRs(nodeName string) (string, string, error) { +func getNodePodCIDRs(nodeName, netName string) (string, string, error) { // retrieve the pod cidr for the worker node jsonFlag := "jsonpath='{.metadata.annotations.k8s\\.ovn\\.org/node-subnets}'" kubectlOut, err := e2ekubectl.RunKubectl("default", "get", "node", nodeName, "-o", jsonFlag) @@ -2043,7 +2043,7 @@ func getNodePodCIDRs(nodeName string) (string, string, error) { ssSubnets := make(map[string]string) if err := json.Unmarshal([]byte(annotation), &ssSubnets); err == nil { // If only one subnet, determine if it's v4 or v6 - if subnet, ok := ssSubnets["default"]; ok { + if subnet, 
ok := ssSubnets[netName]; ok { if strings.Contains(subnet, ":") { ipv6CIDR = subnet } else { @@ -2055,7 +2055,7 @@ func getNodePodCIDRs(nodeName string) (string, string, error) { dsSubnets := make(map[string][]string) if err := json.Unmarshal([]byte(annotation), &dsSubnets); err == nil { - if subnets, ok := dsSubnets["default"]; ok && len(subnets) > 0 { + if subnets, ok := dsSubnets[netName]; ok && len(subnets) > 0 { // Classify each subnet as IPv4 or IPv6 for _, subnet := range subnets { if strings.Contains(subnet, ":") { @@ -2068,7 +2068,7 @@ func getNodePodCIDRs(nodeName string) (string, string, error) { } } - return "", "", fmt.Errorf("could not parse annotation %q", annotation) + return "", "", fmt.Errorf("could not parse annotation %q for network %s", annotation, netName) } var _ = ginkgo.Describe("e2e delete databases", func() { diff --git a/test/e2e/external_gateways.go b/test/e2e/external_gateways.go index c3b2f12198..628866910b 100644 --- a/test/e2e/external_gateways.go +++ b/test/e2e/external_gateways.go @@ -194,7 +194,7 @@ var _ = ginkgo.Describe("External Gateway", feature.ExternalGateway, func() { framework.Logf("Annotating the external gateway test namespace to a container gw: %s ", exGWIpAlt1) e2ekubectl.RunKubectlOrDie(f.Namespace.Name, annotateArgs...) - podCIDR, _, err := getNodePodCIDRs(node.Name) + podCIDR, _, err := getNodePodCIDRs(node.Name, "default") if err != nil { framework.Failf("Error retrieving the pod cidr from %s %v", node.Name, err) } @@ -401,7 +401,7 @@ var _ = ginkgo.Describe("External Gateway", feature.ExternalGateway, func() { nodeIP = primaryNetworkInf.IPv6 } framework.Logf("the pod side node is %s and the source node ip is %s", workerNodeInfo.name, nodeIP) - podCIDR, _, err := getNodePodCIDRs(workerNodeInfo.name) + podCIDR, _, err := getNodePodCIDRs(workerNodeInfo.name, "default") if err != nil { framework.Failf("Error retrieving the pod cidr from %s %v", workerNodeInfo.name, err) } @@ -857,7 +857,7 @@ var _ = ginkgo.Describe("External Gateway", feature.ExternalGateway, func() { podConnEntriesWithMACLabelsSet := pokeConntrackEntries(nodeName, addresses.srcPodIP, protocol, macAddressGW) gomega.Expect(podConnEntriesWithMACLabelsSet).To(gomega.Equal(2)) totalPodConnEntries := pokeConntrackEntries(nodeName, addresses.srcPodIP, protocol, nil) - gomega.Expect(totalPodConnEntries).To(gomega.Equal(6)) // total conntrack entries for this pod/protocol + gomega.Expect(totalPodConnEntries).To(gomega.Equal(4)) // total conntrack entries for this pod/protocol ginkgo.By("Remove second external gateway IP from the app namespace annotation") annotateNamespaceForGateway(f.Namespace.Name, false, addresses.gatewayIPs[0]) @@ -867,7 +867,7 @@ var _ = ginkgo.Describe("External Gateway", feature.ExternalGateway, func() { totalPodConnEntries = pokeConntrackEntries(nodeName, addresses.srcPodIP, protocol, nil) gomega.Expect(podConnEntriesWithMACLabelsSet).To(gomega.Equal(1)) // we still have the conntrack entry for the remaining gateway - gomega.Expect(totalPodConnEntries).To(gomega.Equal(5)) // 6-1 + gomega.Expect(totalPodConnEntries).To(gomega.Equal(3)) // 4-1 ginkgo.By("Remove first external gateway IP from the app namespace annotation") annotateNamespaceForGateway(f.Namespace.Name, false, "") @@ -877,7 +877,7 @@ var _ = ginkgo.Describe("External Gateway", feature.ExternalGateway, func() { totalPodConnEntries = pokeConntrackEntries(nodeName, addresses.srcPodIP, protocol, nil) gomega.Expect(podConnEntriesWithMACLabelsSet).To(gomega.Equal(0)) // we don't have any remaining 
gateways left - gomega.Expect(totalPodConnEntries).To(gomega.Equal(4)) // 6-2 + gomega.Expect(totalPodConnEntries).To(gomega.Equal(2)) // 4-2 }, ginkgo.Entry("IPV4 udp", &addressesv4, "udp"), @@ -938,7 +938,7 @@ var _ = ginkgo.Describe("External Gateway", feature.ExternalGateway, func() { podConnEntriesWithMACLabelsSet := pokeConntrackEntries(nodeName, addresses.srcPodIP, protocol, macAddressGW) gomega.Expect(podConnEntriesWithMACLabelsSet).To(gomega.Equal(2)) totalPodConnEntries := pokeConntrackEntries(nodeName, addresses.srcPodIP, protocol, nil) - gomega.Expect(totalPodConnEntries).To(gomega.Equal(6)) // total conntrack entries for this pod/protocol + gomega.Expect(totalPodConnEntries).To(gomega.Equal(4)) // total conntrack entries for this pod/protocol cleanUpFn := handleGatewayPodRemoval(f, removalType, gatewayPodName2, servingNamespace, addresses.gatewayIPs[1], true) if cleanUpFn != nil { @@ -960,7 +960,7 @@ var _ = ginkgo.Describe("External Gateway", feature.ExternalGateway, func() { totalPodConnEntries = pokeConntrackEntries(nodeName, addresses.srcPodIP, protocol, nil) gomega.Expect(podConnEntriesWithMACLabelsSet).To(gomega.Equal(1)) // we still have the conntrack entry for the remaining gateway - gomega.Expect(totalPodConnEntries).To(gomega.Equal(5)) // 6-1 + gomega.Expect(totalPodConnEntries).To(gomega.Equal(3)) // 4-1 ginkgo.By("Remove first external gateway pod's routing-namespace annotation") annotatePodForGateway(gatewayPodName1, servingNamespace, "", addresses.gatewayIPs[0], false) @@ -980,7 +980,7 @@ var _ = ginkgo.Describe("External Gateway", feature.ExternalGateway, func() { totalPodConnEntries = pokeConntrackEntries(nodeName, addresses.srcPodIP, protocol, nil) gomega.Expect(podConnEntriesWithMACLabelsSet).To(gomega.Equal(0)) // we don't have any remaining gateways left - gomega.Expect(totalPodConnEntries).To(gomega.Equal(4)) // 6-2 + gomega.Expect(totalPodConnEntries).To(gomega.Equal(2)) // 4-2 }, ginkgo.Entry("IPV4 udp + pod annotation update", &addressesv4, "udp", GatewayUpdate), ginkgo.Entry("IPV4 tcp + pod annotation update", &addressesv4, "tcp", GatewayUpdate), @@ -1960,7 +1960,7 @@ var _ = ginkgo.Describe("External Gateway", feature.ExternalGateway, func() { ginkgo.By("Check if conntrack entries for ECMP routes are created for the 2 external gateways") nodeName := getPod(f, srcPodName).Spec.NodeName podConnEntriesWithMACLabelsSet := 2 - totalPodConnEntries := 6 + totalPodConnEntries := 4 gomega.Eventually(func() int { return pokeConntrackEntries(nodeName, addresses.srcPodIP, protocol, macAddressGW) }, time.Minute, 5).Should(gomega.Equal(podConnEntriesWithMACLabelsSet)) @@ -1970,7 +1970,7 @@ var _ = ginkgo.Describe("External Gateway", feature.ExternalGateway, func() { updateAPBExternalRouteCRWithStaticHop(defaultPolicyName, f.Namespace.Name, false, addresses.gatewayIPs[0]) podConnEntriesWithMACLabelsSet = 1 // we still have the conntrack entry for the remaining gateway - totalPodConnEntries = 5 // 6-1 + totalPodConnEntries = 3 // 4-1 gomega.Eventually(func() int { n := pokeConntrackEntries(nodeName, addresses.srcPodIP, protocol, macAddressGW) @@ -1985,7 +1985,7 @@ var _ = ginkgo.Describe("External Gateway", feature.ExternalGateway, func() { ginkgo.By("Check if conntrack entries for ECMP routes are removed for the deleted external gateway if traffic is UDP") podConnEntriesWithMACLabelsSet = 0 // we don't have any remaining gateways left - totalPodConnEntries = 4 // 6-2 + totalPodConnEntries = 2 // 4-2 gomega.Eventually(func() int { n := pokeConntrackEntries(nodeName, 
addresses.srcPodIP, protocol, macAddressGW) @@ -2038,7 +2038,7 @@ var _ = ginkgo.Describe("External Gateway", feature.ExternalGateway, func() { ginkgo.By("Check if conntrack entries for ECMP routes are created for the 2 external gateways") nodeName := getPod(f, srcPodName).Spec.NodeName podConnEntriesWithMACLabelsSet := 2 // TCP - totalPodConnEntries := 6 // TCP + totalPodConnEntries := 4 // TCP gomega.Eventually(func() int { n := pokeConntrackEntries(nodeName, addresses.srcPodIP, protocol, macAddressGW) @@ -2055,7 +2055,7 @@ var _ = ginkgo.Describe("External Gateway", feature.ExternalGateway, func() { ginkgo.By("Check if conntrack entries for ECMP routes are removed for the deleted external gateway if traffic is UDP") podConnEntriesWithMACLabelsSet = 1 // we still have the conntrack entry for the remaining gateway - totalPodConnEntries = 5 // 6-1 + totalPodConnEntries = 3 // 4-1 gomega.Eventually(func() int { n := pokeConntrackEntries(nodeName, addresses.srcPodIP, protocol, macAddressGW) @@ -2072,7 +2072,7 @@ var _ = ginkgo.Describe("External Gateway", feature.ExternalGateway, func() { ginkgo.By("Check if conntrack entries for ECMP routes are removed for the deleted external gateway if traffic is UDP") podConnEntriesWithMACLabelsSet = 0 //we don't have any remaining gateways left - totalPodConnEntries = 4 + totalPodConnEntries = 2 gomega.Eventually(func() int { n := pokeConntrackEntries(nodeName, addresses.srcPodIP, protocol, macAddressGW) klog.Infof("Number of entries with macAddressGW %s:%d", macAddressGW, n) @@ -2776,9 +2776,9 @@ var _ = ginkgo.Describe("External Gateway", feature.ExternalGateway, func() { } nodeName := getPod(f, srcPodName).Spec.NodeName - expectedTotalEntries := 6 + expectedTotalEntries := 4 expectedMACEntries := 2 - ginkgo.By("Check to ensure initial conntrack entries are 2 mac address label, and 6 total entries") + ginkgo.By("Check to ensure initial conntrack entries are 2 mac address label, and 4 total entries") gomega.Eventually(func() int { n := pokeConntrackEntries(nodeName, addresses.srcPodIP, protocol, macAddressGW) klog.Infof("Number of entries with macAddressGW %s:%d", macAddressGW, n) @@ -2848,7 +2848,7 @@ var _ = ginkgo.Describe("External Gateway", feature.ExternalGateway, func() { podConnEntriesWithMACLabelsSet := pokeConntrackEntries(nodeName, addresses.srcPodIP, protocol, macAddressGW) gomega.Expect(podConnEntriesWithMACLabelsSet).To(gomega.Equal(2)) totalPodConnEntries := pokeConntrackEntries(nodeName, addresses.srcPodIP, protocol, nil) - gomega.Expect(totalPodConnEntries).To(gomega.Equal(6)) // total conntrack entries for this pod/protocol + gomega.Expect(totalPodConnEntries).To(gomega.Equal(4)) // total conntrack entries for this pod/protocol checkAPBExternalRouteStatus(defaultPolicyName) }, ginkgo.Entry("IPV4 udp", &addressesv4, "udp"), diff --git a/test/e2e/route_advertisements.go b/test/e2e/route_advertisements.go index f65dd60631..5f9e53a8cc 100644 --- a/test/e2e/route_advertisements.go +++ b/test/e2e/route_advertisements.go @@ -20,6 +20,7 @@ import ( apitypes "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/crd/types" udnv1 "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/crd/userdefinednetwork/v1" udnclientset "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/crd/userdefinednetwork/v1/apis/clientset/versioned" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" "github.com/ovn-org/ovn-kubernetes/test/e2e/deploymentconfig" "github.com/ovn-org/ovn-kubernetes/test/e2e/feature" 
"github.com/ovn-org/ovn-kubernetes/test/e2e/images" @@ -28,7 +29,6 @@ import ( "github.com/ovn-org/ovn-kubernetes/test/e2e/label" corev1 "k8s.io/api/core/v1" - v1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -62,18 +62,7 @@ var _ = ginkgo.Describe("BGP: Pod to external server when default podNetwork is f := wrappedTestFramework("pod2external-route-advertisements") ginkgo.BeforeEach(func() { - serverContainerIPs = []string{} - bgpNetwork, err := infraprovider.Get().GetNetwork(bgpExternalNetworkName) - framework.ExpectNoError(err, "must get bgpnet network") - bgpServer := infraapi.ExternalContainer{Name: serverContainerName} - networkInterface, err := infraprovider.Get().GetExternalContainerNetworkInterface(bgpServer, bgpNetwork) - framework.ExpectNoError(err, "container %s attached to network %s must contain network info", serverContainerName, bgpExternalNetworkName) - if isIPv4Supported(f.ClientSet) && len(networkInterface.IPv4) > 0 { - serverContainerIPs = append(serverContainerIPs, networkInterface.IPv4) - } - if isIPv6Supported(f.ClientSet) && len(networkInterface.IPv6) > 0 { - serverContainerIPs = append(serverContainerIPs, networkInterface.IPv6) - } + serverContainerIPs = getBGPServerContainerIPs(f) framework.Logf("The external server IPs are: %+v", serverContainerIPs) providerPrimaryNetwork, err := infraprovider.Get().PrimaryNetwork() framework.ExpectNoError(err, "provider primary network must be available") @@ -185,33 +174,7 @@ var _ = ginkgo.Describe("BGP: Pod to external server when default podNetwork is //10.244.2.0/24 nhid 25 via 172.18.0.4 dev eth0 proto bgp metric 20 for _, serverContainerIP := range serverContainerIPs { for _, node := range nodes.Items { - podv4CIDR, podv6CIDR, err := getNodePodCIDRs(node.Name) - if err != nil { - framework.Failf("Error retrieving the pod cidr from %s %v", node.Name, err) - } - framework.Logf("the pod cidr for node %s-%s is %s", node.Name, podv4CIDR, podv6CIDR) - ipVer := "" - podCIDR := podv4CIDR - nodeIP := e2enode.GetAddressesByTypeAndFamily(&node, corev1.NodeInternalIP, corev1.IPv4Protocol) - if utilnet.IsIPv6String(serverContainerIP) { - ipVer = " -6" - podCIDR = podv6CIDR - // BGP by default uses LLA as nexthops in its routes - nodeIPv6LLA, err := GetNodeIPv6LinkLocalAddressForEth0(node.Name) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - nodeIP = []string{nodeIPv6LLA} - } - gomega.Expect(len(nodeIP)).To(gomega.BeNumerically(">", 0)) - framework.Logf("the nodeIP for node %s is %+v", node.Name, nodeIP) - externalContainer := infraapi.ExternalContainer{Name: routerContainerName} - bgpRouteCommand := strings.Split(fmt.Sprintf("ip%s route show %s", ipVer, podCIDR), " ") - framework.Logf("Checking for node %s's route for pod subnet %s", node.Name, podCIDR) - gomega.Eventually(func() bool { - routes, err := infraprovider.Get().ExecExternalContainerCommand(externalContainer, bgpRouteCommand) - framework.ExpectNoError(err, "failed to get BGP routes from intermediary router") - framework.Logf("Routes in FRR %s", routes) - return strings.Contains(routes, nodeIP[0]) - }, 30*time.Second).Should(gomega.BeTrue()) + checkL3NodePodRoute(node, serverContainerIP, routerContainerName, types.DefaultNetworkName) } } @@ -382,6 +345,12 @@ var _ = ginkgo.Describe("BGP: Pod to external server when CUDN network is advert externalServerV4CIDR, externalServerV6CIDR, err := bgpNetwork.IPv4IPv6Subnets() framework.ExpectNoError(err, "must get BGP network 
subnets") framework.Logf("the network cidrs to be imported are v4=%s and v6=%s", externalServerV4CIDR, externalServerV6CIDR) + var nodeIPv6LLA string + if isDualStackCluster(nodes) { + var err error + nodeIPv6LLA, err = GetNodeIPv6LinkLocalAddressForEth0(routerContainerName) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } for _, node := range nodes.Items { ipVer := "" bgpRouteCommand := strings.Split(fmt.Sprintf("ip%s route show %s", ipVer, externalServerV4CIDR), " ") @@ -394,8 +363,6 @@ var _ = ginkgo.Describe("BGP: Pod to external server when CUDN network is advert }, 30*time.Second).Should(gomega.BeTrue()) if isDualStackCluster(nodes) { ipVer = " -6" - nodeIPv6LLA, err := GetNodeIPv6LinkLocalAddressForEth0(routerContainerName) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) bgpRouteCommand := strings.Split(fmt.Sprintf("ip%s route show %s", ipVer, externalServerV6CIDR), " ") framework.Logf("Checking for server's route in node %s", node.Name) gomega.Eventually(func() bool { @@ -535,7 +502,7 @@ var _ = ginkgo.DescribeTableSubtree("BGP: isolation between advertised networks" ipFamilyV6 ) - f := wrappedTestFramework("bpp-network-isolation") + f := wrappedTestFramework("bgp-network-isolation") f.SkipNamespaceCreation = true var udnNamespaceA, udnNamespaceB *corev1.Namespace var nodes *corev1.NodeList @@ -626,7 +593,7 @@ var _ = ginkgo.DescribeTableSubtree("BGP: isolation between advertised networks" } // create host networked Pods - _, err := createPod(f, node.Name+"-hostnet-ep", node.Name, f.Namespace.Name, []string{}, map[string]string{}, func(p *v1.Pod) { + _, err := createPod(f, node.Name+"-hostnet-ep", node.Name, f.Namespace.Name, []string{}, map[string]string{}, func(p *corev1.Pod) { p.Spec.Containers[0].Args = args p.Spec.HostNetwork = true }) @@ -652,6 +619,7 @@ var _ = ginkgo.DescribeTableSubtree("BGP: isolation between advertised networks" svc.Spec.Ports = []corev1.ServicePort{{Port: 8080}} familyPolicy := corev1.IPFamilyPolicyPreferDualStack svc.Spec.IPFamilyPolicy = &familyPolicy + svc.Spec.Type = corev1.ServiceTypeNodePort svcNetA, err = f.ClientSet.CoreV1().Services(pod.Namespace).Create(context.Background(), svc, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -675,6 +643,7 @@ var _ = ginkgo.DescribeTableSubtree("BGP: isolation between advertised networks" svc.Name = fmt.Sprintf("service-default") svc.Namespace = "default" svc.Spec.Selector = pod.Labels + svc.Spec.Type = corev1.ServiceTypeNodePort svcNetDefault, err = f.ClientSet.CoreV1().Services(pod.Namespace).Create(context.Background(), svc, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -720,6 +689,20 @@ var _ = ginkgo.DescribeTableSubtree("BGP: isolation between advertised networks" } return condition.Reason }, 30*time.Second, time.Second).Should(gomega.Equal("Accepted")) + + ginkgo.By("ensure routes from UDNs are learned by the external FRR router") + serverContainerIPs := getBGPServerContainerIPs(f) + for _, serverContainerIP := range serverContainerIPs { + for _, node := range nodes.Items { + if cudnA.Spec.Network.Topology == udnv1.NetworkTopologyLayer3 { + checkL3NodePodRoute(node, serverContainerIP, routerContainerName, types.CUDNPrefix+cudnATemplate.Name) + checkL3NodePodRoute(node, serverContainerIP, routerContainerName, types.CUDNPrefix+cudnBTemplate.Name) + } else { + checkL2NodePodRoute(node, serverContainerIP, routerContainerName, cudnATemplate.Spec.Network.Layer2.Subnets) + checkL2NodePodRoute(node, serverContainerIP, routerContainerName, 
cudnBTemplate.Spec.Network.Layer2.Subnets) + } + } + } }) ginkgo.AfterEach(func() { @@ -754,6 +737,7 @@ var _ = ginkgo.DescribeTableSubtree("BGP: isolation between advertised networks" } if svcNetDefault != nil { err = f.ClientSet.CoreV1().Services(svcNetDefault.Namespace).Delete(context.Background(), svcNetDefault.Name, metav1.DeleteOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) svcNetDefault = nil } @@ -858,7 +842,7 @@ var _ = ginkgo.DescribeTableSubtree("BGP: isolation between advertised networks" framework.ExpectNoError(err) return clientPod.Name, clientPod.Namespace, net.JoinHostPort(srvPodStatus.IPs[ipFamilyIndex].IP.String(), "8080") + "/clientip", clientPodStatus.IPs[ipFamilyIndex].IP.String(), false }), - ginkgo.Entry("pod to pod on different networks and same node should not work", + ginkgo.Entry("pod to pod connectivity on different networks and same node", func(ipFamilyIndex int) (clientName string, clientNamespace string, dst string, expectedOutput string, expectErr bool) { // podsNetA[2] and podNetB are on the same node clientPod := podsNetA[2] @@ -866,10 +850,29 @@ var _ = ginkgo.DescribeTableSubtree("BGP: isolation between advertised networks" srvPodStatus, err := getPodAnnotationForAttachment(srvPod, namespacedName(srvPod.Namespace, cudnBTemplate.Name)) framework.ExpectNoError(err) - return clientPod.Name, clientPod.Namespace, net.JoinHostPort(srvPodStatus.IPs[ipFamilyIndex].IP.String(), "8080") + "/clientip", curlConnectionTimeoutCode, true + var ( + curlOutput string + curlErr bool + ) + // Test behavior depends on the ADVERTISED_UDN_ISOLATION_MODE environment variable: + // - "loose": Pod connectivity is allowed, test expects success + // - anything else (including unset): Treated as "strict", pod connectivity is blocked + if os.Getenv("ADVERTISED_UDN_ISOLATION_MODE") == "loose" { + clientPodStatus, err := getPodAnnotationForAttachment(clientPod, namespacedName(clientPod.Namespace, cudnATemplate.Name)) + framework.ExpectNoError(err) + + // With the above underlay routing configuration client pod can reach server pod. 
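The loose/strict branch above is repeated by the entries that follow; the tests read the environment variable directly, and the check amounts to this small helper (hypothetical name, not part of the PR):

package sketch

import "os"

// isLooseUDNIsolation reports whether the e2e job runs with
// ADVERTISED_UDN_ISOLATION_MODE=loose. Any other value, including unset,
// is treated as the default "strict" mode, per the comment above.
func isLooseUDNIsolation() bool {
	return os.Getenv("ADVERTISED_UDN_ISOLATION_MODE") == "loose"
}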
+ curlOutput = clientPodStatus.IPs[ipFamilyIndex].IP.String() + curlErr = false + } else { + curlOutput = curlConnectionTimeoutCode + curlErr = true + } + return clientPod.Name, clientPod.Namespace, net.JoinHostPort(srvPodStatus.IPs[ipFamilyIndex].IP.String(), "8080") + "/clientip", + curlOutput, curlErr }), - ginkgo.Entry("pod to pod on different networks and different nodes should not work", + ginkgo.Entry("pod to pod connectivity on different networks and different nodes", func(ipFamilyIndex int) (clientName string, clientNamespace string, dst string, expectedOutput string, expectErr bool) { // podsNetA[0] and podNetB are on different nodes clientPod := podsNetA[0] @@ -877,7 +880,21 @@ var _ = ginkgo.DescribeTableSubtree("BGP: isolation between advertised networks" srvPodStatus, err := getPodAnnotationForAttachment(srvPod, namespacedName(srvPod.Namespace, cudnBTemplate.Name)) framework.ExpectNoError(err) - return clientPod.Name, clientPod.Namespace, net.JoinHostPort(srvPodStatus.IPs[ipFamilyIndex].IP.String(), "8080") + "/clientip", curlConnectionTimeoutCode, true + var ( + curlOutput string + curlErr bool + ) + if os.Getenv("ADVERTISED_UDN_ISOLATION_MODE") == "loose" { + clientPodStatus, err := getPodAnnotationForAttachment(clientPod, namespacedName(clientPod.Namespace, cudnATemplate.Name)) + framework.ExpectNoError(err) + + curlOutput = clientPodStatus.IPs[ipFamilyIndex].IP.String() + curlErr = false + } else { + curlOutput = curlConnectionTimeoutCode + curlErr = true + } + return clientPod.Name, clientPod.Namespace, net.JoinHostPort(srvPodStatus.IPs[ipFamilyIndex].IP.String(), "8080") + "/clientip", curlOutput, curlErr }), ginkgo.Entry("pod in the default network should not be able to access an advertised UDN pod on the same node", func(ipFamilyIndex int) (clientName string, clientNamespace string, dst string, expectedOutput string, expectErr bool) { @@ -909,7 +926,16 @@ var _ = ginkgo.DescribeTableSubtree("BGP: isolation between advertised networks" }), ginkgo.Entry("pod in the UDN should not be able to access a default network service", func(ipFamilyIndex int) (clientName string, clientNamespace string, dst string, expectedOutput string, expectErr bool) { - return podsNetA[0].Name, podsNetA[0].Namespace, net.JoinHostPort(svcNetDefault.Spec.ClusterIPs[ipFamilyIndex], "8080") + "/clientip", curlConnectionTimeoutCode, true + err := true + out := curlConnectionTimeoutCode + if cudnATemplate.Spec.Network.Topology == udnv1.NetworkTopologyLayer2 { + // FIXME: prevent looping of traffic in L2 UDNs + // bad behaviour: packet is looping from management port -> breth0 -> GR -> management port -> breth0 and so on + // which is a never ending loop + // this causes curl timeout with code 7 host unreachable instead of code 28 + out = "" + } + return podsNetA[0].Name, podsNetA[0].Namespace, net.JoinHostPort(svcNetDefault.Spec.ClusterIPs[ipFamilyIndex], "8080") + "/clientip", out, err }), ginkgo.Entry("pod in the UDN should be able to access kapi in default network service", func(ipFamilyIndex int) (clientName string, clientNamespace string, dst string, expectedOutput string, expectErr bool) { @@ -954,11 +980,11 @@ var _ = ginkgo.DescribeTableSubtree("BGP: isolation between advertised networks" // options [mss 1360,sackOK,TS val 3006752321 ecr 0,nop,wscale 7], length 0 // 10:59:55.352404 ovn-k8s-mp87 In ifindex 186 0a:58:5d:5d:01:01 ethertype IPv4 (0x0800), length 80: (tos 0x0, ttl 63, id 57264, // offset 0, flags [DF], proto TCP (6), length 60) - // 93.93.1.5.36363 > 172.18.0.2.25022: Flags [S], 
cksum 0xe0b7 (correct), seq 3879759281, win 65280, + // 169.254.169.12.36363 > 172.18.0.2.25022: Flags [S], cksum 0xe0b7 (correct), seq 3879759281, win 65280, + // options [mss 1360,sackOK,TS val 3006752321 ecr 0,nop,wscale 7], length 0 // 10:59:55.352461 ovn-k8s-mp87 Out ifindex 186 0a:58:5d:5d:01:02 ethertype IPv4 (0x0800), length 60: (tos 0x0, ttl 64, id 0, // offset 0, flags [DF], proto TCP (6), length 40) - // 172.18.0.2.25022 > 93.93.1.5.36363: Flags [R.], cksum 0x609d (correct), seq 0, ack 3879759282, win 0, length 0 + // 172.18.0.2.25022 > 169.254.169.12.36363: Flags [R.], cksum 0x609d (correct), seq 0, ack 3879759282, win 0, length 0 // 10:59:55.352927 319594f193d4d_3 Out ifindex 191 0a:58:5d:5d:01:02 ethertype IPv4 (0x0800), length 60: (tos 0x0, ttl 64, id 0, // offset 0, flags [DF], proto TCP (6), length 40) // 172.18.0.2.25022 > 93.93.1.5.36363: Flags [R.], cksum 0x609d (correct), seq 0, ack 1, win 0, length 0 @@ -971,25 +997,116 @@ var _ = ginkgo.DescribeTableSubtree("BGP: isolation between advertised networks" node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), podsNetA[2].Spec.NodeName, metav1.GetOptions{}) framework.ExpectNoError(err) nodeIP := node.Status.Addresses[ipFamilyIndex].Address - errBool := false - out := "" - if cudnATemplate.Spec.Network.Topology == udnv1.NetworkTopologyLayer2 { - // FIXME: this should be removed once we add the SNAT for pod->node traffic - // We now permit asymmetric traffic on LGW. This prevents the issue from occurring with IPv6. - // However, for IPv4 LGW rp_filter is still blocking the replies. - // The situation is different on SGW as we don't allow asymmetric traffic at all, which is why IPv6 traffic fails there too. - if ipFamilyIndex == ipFamilyV4 || !isLocalGWModeEnabled() { - // FIXME: fix assymmetry in L2 UDNs - // bad behaviour: packet is coming from other node -> entering eth0 -> bretho and here kernel drops the packet since - // rp_filter is set to 1 in breth0 and there is an iprule that sends the packet to mpX interface so kernel sees the packet - // having return path different from the incoming interface. - // The SNAT to nodeIP should fix this. - // this causes curl timeout with code 28 - errBool = true - out = curlConnectionTimeoutCode - } + + clientNode, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), clientPod.Spec.NodeName, metav1.GetOptions{}) + framework.ExpectNoError(err) + clientNodeIP := clientNode.Status.Addresses[ipFamilyIndex].Address + // pod -> node traffic should use the node's IP as the source for advertised UDNs. + return clientPod.Name, clientPod.Namespace, net.JoinHostPort(nodeIP, fmt.Sprint(hostNetworkPort)) + "/clientip", clientNodeIP, false + }), + ginkgo.Entry("UDN pod to the same node nodeport service in default network should not work", + // FIXME: https://github.com/ovn-kubernetes/ovn-kubernetes/issues/5410 + func(ipFamilyIndex int) (clientName string, clientNamespace string, dst string, expectedOutput string, expectErr bool) { + clientPod := podsNetA[0] + // podsNetA[0] is on nodes[0]. We need the same node. Let's hit the nodeport on nodes[0].
+ node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), nodes.Items[0].Name, metav1.GetOptions{}) + framework.ExpectNoError(err) + nodeIP := node.Status.Addresses[ipFamilyIndex].Address + nodePort := svcNetDefault.Spec.Ports[0].NodePort + + return clientPod.Name, clientPod.Namespace, net.JoinHostPort(nodeIP, fmt.Sprint(nodePort)) + "/hostname", curlConnectionTimeoutCode, true + }), + ginkgo.Entry("UDN pod to a different node nodeport service in default network should work", + func(ipFamilyIndex int) (clientName string, clientNamespace string, dst string, expectedOutput string, expectErr bool) { + clientPod := podsNetA[0] + // podsNetA[0] is on nodes[0]. We need a different node. podNetDefault is on nodes[1]. + // The service is backed by podNetDefault. Let's hit the nodeport on nodes[2]. + node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), nodes.Items[2].Name, metav1.GetOptions{}) + framework.ExpectNoError(err) + nodeIP := node.Status.Addresses[ipFamilyIndex].Address + nodePort := svcNetDefault.Spec.Ports[0].NodePort + + return clientPod.Name, clientPod.Namespace, net.JoinHostPort(nodeIP, fmt.Sprint(nodePort)) + "/hostname", "", false + }), + ginkgo.Entry("UDN pod to the same node nodeport service in same UDN network should work", + func(ipFamilyIndex int) (clientName string, clientNamespace string, dst string, expectedOutput string, expectErr bool) { + clientPod := podsNetA[0] + // The service is backed by pods in podsNetA. + // We want to hit the nodeport on the same node. + // client is on nodes[0]. Let's hit nodeport on nodes[0]. + node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), nodes.Items[0].Name, metav1.GetOptions{}) + framework.ExpectNoError(err) + nodeIP := node.Status.Addresses[ipFamilyIndex].Address + nodePort := svcNetA.Spec.Ports[0].NodePort + + // The service can be backed by any of the pods in podsNetA, so we can't reliably check the output hostname. + // Just check that the connection is successful. + return clientPod.Name, clientPod.Namespace, net.JoinHostPort(nodeIP, fmt.Sprint(nodePort)) + "/hostname", "", false + }), + ginkgo.Entry("UDN pod to a different node nodeport service in same UDN network should work", + func(ipFamilyIndex int) (clientName string, clientNamespace string, dst string, expectedOutput string, expectErr bool) { + clientPod := podsNetA[0] + // The service is backed by pods in podsNetA. + // We want to hit the nodeport on a different node. + // client is on nodes[0]. Let's hit nodeport on nodes[2]. + node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), nodes.Items[2].Name, metav1.GetOptions{}) + framework.ExpectNoError(err) + nodeIP := node.Status.Addresses[ipFamilyIndex].Address + nodePort := svcNetA.Spec.Ports[0].NodePort + + // sourceIP will be joinSubnetIP for nodeports, so only using hostname endpoint + return clientPod.Name, clientPod.Namespace, net.JoinHostPort(nodeIP, fmt.Sprint(nodePort)) + "/hostname", "", false + }), + ginkgo.Entry("UDN pod to the same node nodeport service in different UDN network should not work", + // FIXME: This test should work: https://github.com/ovn-kubernetes/ovn-kubernetes/issues/5419 + // This traffic flow is expected to work eventually but doesn't work today on Layer3 (v4 and v6) and Layer2 (v4 only) networks. + // The reason it doesn't work today is that UDN networks don't have MAC bindings for the masqueradeIPs of other networks.
+ // Traffic flow: UDN pod in network A -> same-node nodeIP:nodePort service of networkB + // UDN pod in networkA -> ovn-switch -> ovn-cluster-router (SNAT to masqueradeIP of networkA) -> mpX interface -> + // enters the host and hits IPTables rules to DNAT to clusterIP:Port of service of networkB. + // Then it hits the pkt_mark flows on breth0 and gets sent into networkB's patchport where it hits the GR. + // On the GR we DNAT to backend pod and SNAT to joinIP. + // Reply: the pod replies, and OVN in networkB then tries to ARP for the masqueradeIP of networkA (the source) and simply + // fails as it doesn't know how to reach this masqueradeIP. + // There is also an inconsistency in behaviour within Layer2 networks between IPv4 and IPv6, where the traffic + // works on IPv6 because of the flows described below. + func(ipFamilyIndex int) (clientName string, clientNamespace string, dst string, expectedOutput string, expectErr bool) { + clientPod := podsNetA[0] + node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), nodes.Items[0].Name, metav1.GetOptions{}) + framework.ExpectNoError(err) + nodeIP := node.Status.Addresses[ipFamilyIndex].Address + nodePort := svcNetB.Spec.Ports[0].NodePort + out := curlConnectionTimeoutCode + errBool := true + if ipFamilyIndex == ipFamilyV6 && cudnATemplate.Spec.Network.Topology == udnv1.NetworkTopologyLayer2 { + // For Layer2 networks, we have these flows we add on breth0: + // cookie=0xdeff105, duration=173.245s, table=1, n_packets=0, n_bytes=0, idle_age=173, priority=14,icmp6,icmp_type=134 actions=FLOOD + // cookie=0xdeff105, duration=173.245s, table=1, n_packets=8, n_bytes=640, idle_age=4, priority=14,icmp6,icmp_type=136 actions=FLOOD + // which floods the Router Advertisement (RA, type 134) and Neighbor Advertisement (NA, type 136) + // Given that on Layer2 the GR has the SNATs for both masqueradeIPs, this works perfectly well and + // the networks are able to resolve the masqueradeIPs of the other networks via NDP. + // This doesn't work on Layer3 networks since masqueradeIP SNATs are present on the ovn-cluster-router in that case. + // See the tcpdump on the issue: https://github.com/ovn-kubernetes/ovn-kubernetes/issues/5410 for more details. + out = "" + errBool = false } - return clientPod.Name, clientPod.Namespace, net.JoinHostPort(nodeIP, fmt.Sprint(hostNetworkPort)) + "/hostname", out, errBool + + // sourceIP will be joinSubnetIP for nodeports, so only using hostname endpoint + return clientPod.Name, clientPod.Namespace, net.JoinHostPort(nodeIP, fmt.Sprint(nodePort)) + "/hostname", out, errBool + }), + ginkgo.Entry("UDN pod to a different node nodeport service in different UDN network should work", + func(ipFamilyIndex int) (clientName string, clientNamespace string, dst string, expectedOutput string, expectErr bool) { + clientPod := podsNetA[0] + // The service is backed by podNetB. + // We want to hit the nodeport on a different node from the client. + // client is on nodes[0]. Let's hit nodeport on nodes[2].
+ node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), nodes.Items[2].Name, metav1.GetOptions{}) + framework.ExpectNoError(err) + nodeIP := node.Status.Addresses[ipFamilyIndex].Address + nodePort := svcNetB.Spec.Ports[0].NodePort + + // sourceIP will be joinSubnetIP for nodeports, so only using hostname endpoint + return clientPod.Name, clientPod.Namespace, net.JoinHostPort(nodeIP, fmt.Sprint(nodePort)) + "/hostname", "", false }), ) @@ -2215,3 +2332,89 @@ func createRouteAdvertisements( return nil } + +// getBGPServerContainerIPs retrieves the IP addresses of the BGP server container. +func getBGPServerContainerIPs(f *framework.Framework) (serverContainerIPs []string) { + bgpNetwork, err := infraprovider.Get().GetNetwork(bgpExternalNetworkName) // pre-created network + framework.ExpectNoError(err, "must get bgpnet network") + bgpServer := infraapi.ExternalContainer{Name: serverContainerName} + networkInterface, err := infraprovider.Get().GetExternalContainerNetworkInterface(bgpServer, bgpNetwork) + framework.ExpectNoError(err, "container %s attached to network %s must contain network info", serverContainerName, bgpExternalNetworkName) + if isIPv4Supported(f.ClientSet) && len(networkInterface.IPv4) > 0 { + serverContainerIPs = append(serverContainerIPs, networkInterface.IPv4) + } + if isIPv6Supported(f.ClientSet) && len(networkInterface.IPv6) > 0 { + serverContainerIPs = append(serverContainerIPs, networkInterface.IPv6) + } + return +} + +// checkL3NodePodRoute checks that the BGP route for the given node's pod subnet is present in the FRR router. +// It takes the node to check, a serverContainerIP to determine the IP family in use, and the router container name. +func checkL3NodePodRoute(node corev1.Node, serverContainerIP, routerContainerName, netName string) { + var podv4CIDR, podv6CIDR string + + gomega.Eventually(func() error { + var err error + podv4CIDR, podv6CIDR, err = getNodePodCIDRs(node.Name, netName) + return err + }, 5*time.Second).Should(gomega.Succeed(), "failed to get pod CIDR for node %s, network %s", node.Name, netName) + + framework.Logf("The pod CIDRs for node %s are: v4=%s, v6=%s", node.Name, podv4CIDR, podv6CIDR) + isIPv6 := utilnet.IsIPv6String(serverContainerIP) + podCIDR := podv4CIDR + if isIPv6 { + podCIDR = podv6CIDR + } + gomega.Expect(podCIDR).NotTo(gomega.BeEmpty(), + "pod CIDR for family (isIPv6=%t) missing for node %s on network %s", isIPv6, node.Name, netName) + + checkRouteInFRR(node, podCIDR, routerContainerName, isIPv6) +} + +// checkL2NodePodRoute checks that BGP routes for the given CIDRs are present in the FRR router. +func checkL2NodePodRoute(node corev1.Node, serverContainerIP, routerContainerName string, cidrs udnv1.DualStackCIDRs) { + isServerIPv6 := utilnet.IsIPv6String(serverContainerIP) + for _, podCIDR := range cidrs { + isPodCIDRv6 := utilnet.IsIPv6CIDRString(string(podCIDR)) + // Skip checking if the CIDR family does not match the serverContainerIP family. + if isServerIPv6 != isPodCIDRv6 { + continue + } + checkRouteInFRR(node, string(podCIDR), routerContainerName, isPodCIDRv6) + } +} + +// checkRouteInFRR verifies that a route for a given podCIDR exists in the specified FRR container, +// with the correct node as its nexthop. +func checkRouteInFRR(node corev1.Node, podCIDR, routerContainerName string, isIPv6 bool) { + var ( + ipVer string + nodeIP []string + err error + ) + + if isIPv6 { + ipVer = " -6" + // BGP uses the link-local address as the nexthop for IPv6 routes by default. 
+ var nodeIPv6LLA string + nodeIPv6LLA, err = GetNodeIPv6LinkLocalAddressForEth0(node.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + nodeIP = []string{nodeIPv6LLA} + } else { + nodeIP = e2enode.GetAddressesByTypeAndFamily(&node, corev1.NodeInternalIP, corev1.IPv4Protocol) + } + gomega.Expect(len(nodeIP)).To(gomega.BeNumerically(">", 0), "must find a valid nexthop IP for node %s", node.Name) + framework.Logf("Using nexthop %s for node %s", nodeIP[0], node.Name) + + externalContainer := infraapi.ExternalContainer{Name: routerContainerName} + bgpRouteCommand := strings.Split(fmt.Sprintf("ip%s route show %s", ipVer, podCIDR), " ") + framework.Logf("Checking on router %s for node %s's route to pod subnet %s", routerContainerName, node.Name, podCIDR) + + gomega.Eventually(func() bool { + routes, err := infraprovider.Get().ExecExternalContainerCommand(externalContainer, bgpRouteCommand) + framework.ExpectNoError(err, "failed to get BGP routes from intermediary router") + framework.Logf("Routes in FRR for %s: %s", podCIDR, routes) + return strings.Contains(routes, nodeIP[0]) + }, 30*time.Second).Should(gomega.BeTrue(), "route for %s via %s not found on %s", podCIDR, nodeIP[0], routerContainerName) +}
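For reference, a sketch of how the helpers above compose, mirroring their use in the BeforeEach block earlier in this file; the wrapper name and parameter list are assumptions, and it relies on the imports already present in this file:

// verifyNodePodRoutesLearned is a hypothetical wrapper around the helpers
// above: for every node and every external server IP it checks that the FRR
// router learned a route for the matching address family, using the L3 or L2
// variant depending on the network topology.
func verifyNodePodRoutesLearned(nodes []corev1.Node, serverContainerIPs []string,
	routerContainerName, netName string, topology udnv1.NetworkTopology, l2Subnets udnv1.DualStackCIDRs) {
	for _, node := range nodes {
		for _, serverContainerIP := range serverContainerIPs {
			if topology == udnv1.NetworkTopologyLayer3 {
				checkL3NodePodRoute(node, serverContainerIP, routerContainerName, netName)
			} else {
				checkL2NodePodRoute(node, serverContainerIP, routerContainerName, l2Subnets)
			}
		}
	}
}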