diff --git a/dist/templates/k8s.ovn.org_clusteruserdefinednetworks.yaml.j2 b/dist/templates/k8s.ovn.org_clusteruserdefinednetworks.yaml.j2 index 88901528d4..36c7f050fd 100644 --- a/dist/templates/k8s.ovn.org_clusteruserdefinednetworks.yaml.j2 +++ b/dist/templates/k8s.ovn.org_clusteruserdefinednetworks.yaml.j2 @@ -313,6 +313,14 @@ spec: rule: '!has(self.infrastructureSubnets) || !has(self.reservedSubnets) || self.infrastructureSubnets.all(infra, !self.reservedSubnets.exists(reserved, cidr(infra).containsCIDR(reserved) || cidr(reserved).containsCIDR(infra)))' + - message: infrastructureSubnets must be a masked network address + (no host bits set) + rule: '!has(self.infrastructureSubnets) || self.infrastructureSubnets.all(s, + isCIDR(s) && cidr(s) == cidr(s).masked())' + - message: reservedSubnets must be a masked network address (no + host bits set) + rule: '!has(self.reservedSubnets) || self.reservedSubnets.all(s, + isCIDR(s) && cidr(s) == cidr(s).masked())' layer3: description: Layer3 is the Layer3 topology configuration. properties: diff --git a/dist/templates/k8s.ovn.org_userdefinednetworks.yaml.j2 b/dist/templates/k8s.ovn.org_userdefinednetworks.yaml.j2 index 9b855382a9..073c64bac0 100644 --- a/dist/templates/k8s.ovn.org_userdefinednetworks.yaml.j2 +++ b/dist/templates/k8s.ovn.org_userdefinednetworks.yaml.j2 @@ -257,6 +257,14 @@ spec: rule: '!has(self.infrastructureSubnets) || !has(self.reservedSubnets) || self.infrastructureSubnets.all(infra, !self.reservedSubnets.exists(reserved, cidr(infra).containsCIDR(reserved) || cidr(reserved).containsCIDR(infra)))' + - message: infrastructureSubnets must be a masked network address + (no host bits set) + rule: '!has(self.infrastructureSubnets) || self.infrastructureSubnets.all(s, + isCIDR(s) && cidr(s) == cidr(s).masked())' + - message: reservedSubnets must be a masked network address (no host + bits set) + rule: '!has(self.reservedSubnets) || self.reservedSubnets.all(s, + isCIDR(s) && cidr(s) == cidr(s).masked())' layer3: description: Layer3 is the Layer3 topology configuration. properties: diff --git a/docs/okeps/okep-5494-ovn-kubernetes-mcp-server.md b/docs/okeps/okep-5494-ovn-kubernetes-mcp-server.md new file mode 100644 index 0000000000..cffe1970d5 --- /dev/null +++ b/docs/okeps/okep-5494-ovn-kubernetes-mcp-server.md @@ -0,0 +1,798 @@ +# OKEP-5494: Model Context Protocol for Troubleshooting OVN-Kubernetes + +# Problem Statement + +Diagnosing an issue in [OVN-Kubernetes](https://ovn-kubernetes.io/) network +plugin is complex because it has many layers (Kubernetes, OVN-Kubernetes, OVN, +OpenvSwitch, Kernel - especially Netfilter elements). Usually the person +troubleshooting an issue has to approach it in a layered fashion and has +to be fully aware of all the debugging tools each layer has to offer to +then be able to pin point where the problem is. That is time consuming +and requires years of expertise on each depth of the stack and across all +features. Sometimes it also involves working with the layered community +project teams like OVN and OVS since they hold more knowledge about their +domain than engineers working on the OVN-Kubernetes plugin. Each +troubleshooting session to understand where the packet is getting +blackholed or dropped takes a lot of time to solve (unless it's trivial). 
+
# Goals

The goal of this enhancement is to try to improve "troubleshooting time" and
"tool usage/typing time", and to erase the need to know each of those tools
down to the parameter level, by exposing these tools using
[Model Context Protocol](https://modelcontextprotocol.io/docs/learn/architecture#overview) (MCP)
and leveraging the backend model's (Claude Sonnet 4, Gemini 2.5 Pro, GPT-5, etc.)
knowledge and context to troubleshoot issues on live clusters or locally on
containers with end users' databases loaded. This can go a long
way in speeding up bug triages.

* Phase 1 targets only ovn-kubernetes community members as the audience
* Build an OVN-Kubernetes MCP Server(s) that exposes all the tools required
  to troubleshoot the OVN-Kubernetes network plugin
  * This MCP Server(s) must also be aware of where to run these tools to
    troubleshoot the issue in question on a cluster (i.e., which node? which
    pod? which container?)
* Add support to load an MCP Server not just against a live cluster
  but also against a simulated environment constructed from an information
  bundle or other debugging data extracted from a live cluster.
  Examples are [must-gather](https://github.com/openshift/must-gather) and
  [sos-reports](https://github.com/sosreport/sos) and how tools like
  [omc](https://github.com/gmeghnag/omc) or [xsos](https://github.com/ryran/xsos)
  can be leveraged
* Ensure that the tools we expose have read-only permissions. Whether this means
  we restrict the tools themselves to read rights only OR expose only read actions
  via the tools is discussed in the proposal section.
* Support troubleshooting all features of OVN-Kubernetes
  * We must have extensive failure scenario and chaos tests injected to see
    how well the LLM is able to troubleshoot
  * We must form a list of all common ovn-kubernetes bugs we have hit across
    features (for example, all possible ways of running into staleSNATs)
  * We must add benchmarking tests for evaluating the quality of troubleshooting
    specific to CNI networking.

# Future Goals

* Phase 2 targets end users and others using OVN-Kubernetes to troubleshoot
  issues
* Expand the MCP server to not just provide tools but also relevant context
  around the OVN-Kubernetes implementation
* Create a RAG system with all internal docs and integrate it with
  the prompt client for better results (this would require improving our docs
  first). This is especially important for the LLM to know how OVN-Kubernetes
  implements each traffic flow and feature and what constructs are created
  in the layers underneath. This might not be needed for older features
  like EgressIP that have plenty of online resources, but for newer features
  like RouteAdvertisements the LLM may not know much without that
  extra context.
* Investigate if there is potential for converting it also into a remediation
  system (this would need write access)
* Work with the layered project community teams (OVN, OpenvSwitch, Kernel)
  for each of those layers to also own an MCP server, since they know their
  stack better. So instead of one OVN-Kubernetes MCP Server it would be a
  set of servers, each owned by a specific upstream community project, for
  better maintenance and ownership
* The same MCP Server could also potentially be used as an
  "OVN-Kubernetes network plugin - know it better" chatbot, but that is not
  the initial goal here.
+
# Future-Stretch-Goals

* Phase 3 includes this getting productized and shipped to end users of
  OVN-Kubernetes, in some far-fetched future, to run in production for
  troubleshooting. But this would require:
  * Having a better overall architecture for a holistic agentic AI
    troubleshooting solution on a cluster
  * Solving the compute problem of how and where to run the model
  * Having airtight security vetting.

  Running this stack in production is out-of-scope for this OKEP; it is a
  future-stretch goal contingent on security and testing milestones.

# Non-Goals

* We will not be solving the problem of which LLM should be used. Most of
  the good ones (based on community member experience) are proprietary (Claude
  Sonnet 4 and Gemini 2.5 Pro), but testing all LLMs and determining which
  works best is not in scope
  * By not solving this problem as part of this enhancement, we risk having
    to deal with "long-context windows causing hallucinations", but that's
    where the RAG system mentioned in the future goals could help.
* We will also not be developing our own model to train and maintain
  * By not solving this problem, we also risk relying on proprietary models
    knowing and learning what we want them to learn while having no control
    over how fast they learn it. Again, RAG could help here.
  * So the quality here will heavily depend on how good the underlying LLM
    is, which is largely outside our control.

# Introduction

An engineer troubleshooting OVN-Kubernetes usually uses the following set of
CLI tools in a layered fashion:

* Kubernetes and OVN-Kubernetes layer - this cluster state information is
  gathered as part of must-gather for offline troubleshooting
  * **kubectl** commands like list, get, describe, logs, exec, events to know
    everything about the Kubernetes API state of the feature and to know what
    the ovnkube pods were doing through the logs they generate during that
    window
  * **ovnkube-trace**, which executes ovn-trace, ovs ofproto trace and detrace
    commands (this tool doesn't support all scenarios - it's not well
    maintained)
  * Future-tool - ovnkube CLI (need to ask NVIDIA what's the status here)
  * A place for a K8s state-database syncer should/could potentially exist
* OVN Layer - OVN databases are gathered as part of must-gather for
  offline troubleshooting
  * **ovn-nbctl** commands that are executed on the ovnkube node pods to
    understand what OVN-Kubernetes created in the northbound database via
    libovsdbclient transactions
  * **ovn-sbctl** commands that are executed on the ovnkube node pods to
    understand what northd created in the southbound database
  * **ovn-trace and detrace** commands that are executed on the ovnkube node
    pods to simulate packet flow tracing based on flows in the
    southbound database
  * **ovn-appctl -t ovn-controller ct-zone-list** to list the conntrack
    zone to OVN construct mapping, for a better understanding of how the
    conntrack commit of the packets happened
* OpenvSwitch Layer - the OpenvSwitch database is usually gathered as part
  of sos-report for offline troubleshooting
  * **ovs-ofctl dump-flows** to debug especially the breth0 openflows
    that OVN-Kubernetes creates on the gateway of each node
  * **ovs-appctl dpctl/dump-flows** to trace live packets, run on a specific
    node's ovs container (KIND) or on the node (OpenShift)
  * **ovs-appctl ofproto/trace** and detrace to run an ovs trace of the
    packet based on the openflows
  * **ovs-appctl dpctl/dump-conntrack** to
know all the conntrack zones used
    for a specific connection
  * **ovs-vsctl** commands to list interfaces and bridges
  * **retis** to see packet drops in ovs
* Netfilter/Kernel Layer - this information is usually gathered as part
  of sos-report for offline troubleshooting
  * **ip util commands** like **ip r** or **ip a** or **ip rule list** for
    debugging VRFs, BGP-learnt routes, the routes OVN-Kubernetes creates
    on the node, or custom routes that end users add
  * **nft list ruleset** to understand what rules were created by
    OVN-Kubernetes, especially in routingViaHost=true gateway mode
  * **iptables-save** to list all iptables rules (given iptables is
    deprecated, we could skip this tool, though for now about 50% of
    ovn-kubernetes is still on IPT)
  * **conntrack** -L or -E on the host itself
  * **ip xfrm policy** and **ip xfrm state** when using IPsec
* TCPDUMP - external open source tools, can't be used for offline
  troubleshooting
  * **tcpdump** is used for packet capture and analysis
  * [**pwru**](https://github.com/cilium/pwru) is used to know the kernel drop reason
  * **libreswan** `ipsec status` and `ipsec trafficstatus` commands
  * **frr** router config and routes learnt by BGP

Ideally speaking, there are also metrics and events that, via alerts, go to
the dashboard, which is probably what most end users use to troubleshoot.
So when we get to phase 3 we would need to reconsider this troubleshooting
stack entirely to include other aspects, like the OVN-Kubernetes
troubleshooting dashboard that the observability team created or the
various packet drop tools the observability team already exposes. But for
the scope of this enhancement, for now, we will consider the above set
of tools as the MVP.

As we can see, that's a lot of tools! So remembering the syntax for each of
these tools, executing them one by one, gathering the information
at each layer, analysing it, and then moving to the next layer takes time
for a human. During a remote analysis of a bug report, the part that takes
the longest is always the RCA, combing through all the data - the same goes for
troubleshooting a cluster (which is slightly easier when we have access to the
cluster than analysing offline data). The fix is usually the easiest part (there are
exceptions).
+ +``` + OVN-Kubernetes Architecture & Troubleshooting Tools + (Per Node Components) + +┌────────────────────────────────────────────────────────────────────────────┐ +│ ovnkube-node pod │ +│ │ +│ ┌─────────────────┐ ┌─────────────────┐ kubectl exec/logs │ +│ │ ovnkube │◄──►│ NBDB │◄─────── ovn-nbctl show/list │ +│ │ controller │ │ (northbound) │ │ +│ └─────────────────┘ └─────────────────┘ │ +│ │ │ │ +│ │ ┌─────────────────┐ │ +│ │ │ northd │ │ +│ │ └─────────────────┘ │ +│ │ │ │ +│ │ ┌─────────────────┐ │ +│ └─────────────►│ SBDB │◄─────── ovn-sbctl show/list │ +│ │ (southbound) │ ovn-trace/detrace │ +│ └─────────────────┘ │ +│ │ │ +│ ┌─────────────────┐ │ +│ │ ovn-controller │◄─────── ovn-appctl ct-zone │ +│ └─────────────────┘ │ +│ │ │ +└───────────────────────────────────┼────────────────────────────────────────┘ + │ + ┌─────────────────┐ + │ OVS │◄─────── ovs-vsctl list + │ (database) │ ovs-appctl dpctl/ + └─────────────────┘ ovs-ofctl dump-flows + │ retis (packet drops) + ┌─────────────────┐ + │ OVS bridge │◄─────── ovs-appctl ofproto/trace + │ br-int/breth0 │ + └─────────────────┘ + │ + ┌─────────────────┐ + │ NIC │ + │ (physical) │ + └─────────────────┘ + │ + ┌──────────────┴──────────────┐ + │ Host Network │ + │ │ + │ ip route/addr/rule ◄───────┼─────── ip commands + │ nft ruleset ◄───────┼─────── nft list ruleset + │ iptables rules ◄───────┼─────── iptables-save + │ conntrack zones ◄───────┼─────── conntrack -L/-E + │ │ + │ Network interfaces ◄───────┼─────── tcpdump/pwru + └─────────────────────────────┘ + + Problem: Engineers must know WHERE to run WHICH tool on WHICH component + to troubleshoot issues across this distributed architecture +``` + +This enhancement aims to solve this pain point of reducing the time taken to +execute these tools and analyse these results using MCP Servers and LLMs. + +# User Stories + +**As an OVN-Kubernetes developer**, **I want to** troubleshoot my stack +without needing to know every tool's parameter fields by-heart or by spending +time looking it up each time I need to troubleshoot a feature **so that** I +can spend my time efficiently. I just want to tell in plain english what I +want and for the MCP server to execute those specific commands. + +**As an OVN-Kubernetes engineer**, **I want to** troubleshoot my stack +without needing to analyse each flow output of these tools when I need to +troubleshoot a feature **so that** I can spend my time efficiently. I just +want to tell in plain english what I want and for the LLM to help me analyze +the output of the commands executed by the MCP Server. I understand that I +will need to verify the reasoning thoroughly before accepting the RCA from AI. + +**As a new engineer joining the OVN-Kubernetes team**, **I want to** retrieve +specific information from different parts of the stack without having +knowledge of the topology or tooling of the stack. + +# Proposed Solution + +We build a Golang MCP Server (could be split into a set of MCP Servers in the +future) that exposes these tools in read only fashion and the LLM backend +that has the required context will analyse the results of the execution and +provide a response back to the prompter who has to verify it thoroughly. This +MCP Server code will be in a new repo in +[ovn-kubernetes org](https://github.com/ovn-kubernetes) called +**ovn-kubernetes-mcp**. + +## Example Workflow for an end-user + +1. An end-user can use any MCP Client to start a troubleshooting session via + prompting. 
The client connects with all the available servers (in our case + the OVN-Kubernetes MCP Server and maybe in the future all layered community + MCP Severs) and gathers their available tools and presents this information + to the LLM along with their schemas. Example: Using Cursor AI as your MCP + Client +2. LLM will be able use its intelligence and analyze the end-user query and + choose the appropriate tools. Example: Using Claude Sonnet4 as your LLM + model +3. MCP Client then receives the LLM's tool call and routes it to the + corresponding MCP Server which executes the tool and client then relays + the response back to the LLM +4. LLM again uses its intelligence to analyze the responses +5. LLM provides a RCA back to the end user + +The steps 2, 3 and 4 is repeated by the LLM and it intelligently does a step-by-step +layered troubleshooting exercise to find the root cause. + +We may also include predefined and tested prompt steps in the documentation +of our repo for helping end-users to give the LLM good context around +OVN-Kubernetes and OVN and OVS. For example, provide the latest OVN man +pages so that it has the current knowledge of OVN DB schemas and tools usage. +Some standard preparation prompt can be maintained in the mcp-server repo +as reference. + +``` +OVN-Kubernetes Troubleshooting MCP Architecture +=============================================== + +Engineer Query: "Pod A can't reach Pod B on different nodes, consistent connection drops" + +┌──────────────────────────────────────────────────────────────────────────────┐ +│ MCP SERVERS (Layer-Specific Tools) │ +└──────────────────────────────────────────────────────────────────────────────┘ + +┌──────────────────────┐ ┌──────────────────────┐ ┌──────────────────────┐ ┌──────────────────────┐ ┌──────────────────────┐ +│ Kubernetes/K8s │ │ OVN Layer │ │ OpenvSwitch │ │ Netfilter/ │ │ TCPDUMP/Debug │ +│ MCP Server │ │ MCP Server │ │ Layer MCP │ │ Kernel MCP │ │ MCP Server │ +│ │ │ │ │ Server │ │ Server │ │ │ +│ Tools: │ │ Tools: │ │ │ │ │ │ Tools: │ +│ • kubectl_get │ │ • ovn_nbctl_show │ │ Tools: │ │ Tools: │ │ • tcpdump_capture │ +│ • kubectl_describe │ │ • ovn_nbctl_list │ │ • ovs_ofctl_flows │ │ • ip_route_show │ │ • tcpdump_analyze │ +│ • kubectl_logs │ │ • ovn_sbctl_show │ │ • ovs_appctl_dpctl │ │ • ip_addr_show │ │ • pwru_trace │ +│ • kubectl_exec │ │ • ovn_sbctl_list │ │ • ovs_ofproto_trace │ │ • ip_rule_show │ │ │ +│ • kubectl_events │ │ • ovn_trace │ │ • ovs_appctl_conntr │ │ • nft_list_ruleset │ │ │ +│ • ovnkube_trace │ │ • ovn_detrace │ │ • ovs_vsctl_show │ │ • iptables_save │ │ │ +│ │ │ • ovn_controller_ct │ │ • retis_capture │ │ • conntrack_list │ │ │ +│ │ │ │ │ │ │ • conntrack_events │ │ │ +└──────────────────────┘ └──────────────────────┘ └──────────────────────┘ └──────────────────────┘ └──────────────────────┘ + + │ All tools aggregated + ▼ + +┌──────────────────────────────────────────────────────────────────────────────┐ +│ MCP CLIENT │ +│ (OVN-K8s Troubleshoot AI) │ +│ │ +│ Unified Tool Interface: │ +│ ┌──────────────────────────────────────────────────────────────────────────┐ │ +│ │ Layer 1 (K8s): kubectl_*, ovnkube_trace │ │ +│ │ Layer 2 (OVN): ovn_nbctl_*, ovn_sbctl_*, ovn_trace_* │ │ +│ │ Layer 3 (OVS): ovs_ofctl_*, ovs_appctl_*, ovs_vsctl_*, retis_* │ │ +│ │ Layer 4 (Kernel): ip_*, nft_*, iptables_*, conntrack_* │ │ +│ │ Layer 5 (Debug): tcpdump_*, pwru_* │ │ +│ └──────────────────────────────────────────────────────────────────────────┘ │ 
+└──────────────────────────────────────────────────────────────────────────────┘ + + │ Present unified interface + ▼ + +┌──────────────────────────────────────────────────────────────────────────────┐ +│ LLM (OVN-K8s Expert) │ +└──────────────────────────────────────────────────────────────────────────────┘ +``` + +Doing multiple MCP Servers has clear advantages like each layer owning their +toolset, independent development and maintenance, granular RBAC, reusability, +one server can't affect the other. Given we would need to align with different +layered communities later on for a fully supported set of servers from their +side which might take several releases, for the phase1 here, we plan to build +our own monolithic server that could then be split into multiple servers later +on. A single unified server is simpler, faster to iterate on. + +So our current implementation design looks like this: + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ MCP Client │ +│ "Debug pod connectivity issues" │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + MCP Protocol + │ + ▼ +┌──────────────────────────────────────────────────────────────────────────────┐ +│ OVN-Kubernetes MCP Server │ +│ │ +│ ┌─────────────────┐ ┌─────────────────┐ ┌──────────────────────────────┐ │ +│ │ Kubernetes │ │ Live Cluster │ │ Offline Bundle │ │ +│ │ Layer │ │ Execution │ │ Execution │ │ +│ │ │ │ │ │ │ │ +│ │ • kubectl get │ │ kubectl exec │ │ Offline artifacts parser │ │ +│ │ • kubectl desc │ │ │ │ tools. │ │ +│ │ • kubectl logs │ │ Direct API │ │ Example: xsos and omc │ │ +│ └─────────────────┘ └─────────────────┘ └──────────────────────────────┘ │ +│ │ +│ ┌──────────────────────────────────────────────────────────────────────┐ │ +│ │ Tool Categories │ │ +│ │ │ │ +│ │ ┌─────────────┐ ┌─────────────┐ ┌──────────────┐ ┌─────────────┐ │ │ +│ │ │ OVN Layer │ │ OVS Layer │ │Kernel Layer │ │External │ │ │ +│ │ │ │ │ │ │ │ │Tools │ │ │ +│ │ │ ovn-nbctl │ │ ovs-ofctl │ │ ip route │ │ tcpdump │ │ │ +│ │ │ ovn-sbctl │ │ ovs-appctl │ │ nft list │ │ pwru │ │ │ +│ │ │ ovn-trace │ │ ovs-vsctl │ │ conntrack │ │ retis │ │ │ +│ │ │ ovn-detrace │ │ ovs-dpctl │ │ iptables-save│ │ │ │ │ +│ │ │ ovn-appctl │ │ ovs-ofproto │ │ │ │ │ │ │ +│ │ └─────────────┘ └─────────────┘ └──────────────┘ └─────────────┘ │ │ +│ └──────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌──────────────────────────────────────────────────────────────────────┐ │ +│ │ Security & RBAC │ │ +│ │ │ │ +│ │ • ovn-troubleshooter ClusterRole (read-only) │ │ +│ │ • Command parameter validation │ │ +│ │ • Node-specific targeting required │ │ +│ │ • Write operations blocked │ │ +│ └──────────────────────────────────────────────────────────────────────┘ │ +└──────────────────────────────────────────────────────────────────────────────┘ + │ + ┌───────────┼───────────┐ + │ │ │ + ▼ ▼ ▼ + ┌────────┐ ┌────────┐ ┌────────┐ + │ Node1 │ │ Node2 │ │ NodeN │ + │ │ │ │ │ │ + │ovnkube-│ │ovnkube-│ │ovnkube-│ + │node pod│ │node pod│ │node pod│ + │ │ │ │ │ │ + │ ovn-nb │ │ ovn-nb │ │ ovn-nb │ + │ ovn-sb │ │ ovn-sb │ │ ovn-sb │ + │ ovs │ │ ovs │ │ ovs │ + └────────┘ └────────┘ └────────┘ + +Data Flow Examples: +├─ Live: kubectl exec ovnkube-node-xyz -c nb-ovsdb -- ovn-nbctl show +├─ Live: kubectl debug node/worker-1 -- ovs-ofctl dump-flows br-int +├─ Live: kubectl debug node/worker-1 -- ip route show table all +├─ Offline: Parse must-gather/ovn-kubernetes/ovn-northbound.db +├─ Offline: 
Parse sos-report/node-1/openvswitch/ovs-ofctl_dump-flows
└─ Offline: Parse sos-report/node-1/networking/ip_route
```

Note: Some of the layers, like Kubernetes and offline debugging, have
existing servers, such as [kubernetes-mcp-server](https://github.com/containers/kubernetes-mcp-server)
and [mustgather-mcp-server](https://github.com/shivprakashmuley/mustgather-mcp-server),
that can be re-used together with the ovn-kubernetes-mcp server for a holistic
end-user experience. However, kubernetes-mcp-server exposes `kubectl-exec`,
which has security implications (although it also has a read-only
mode where only read commands are exposed).

Note 2: Feeding container logs to the LLM will fill the context window
pretty fast. We need to investigate a method to ensure we feed only the
relevant, filtered logs.

## Implementation Details of the OVN-Kubernetes MCP Server

See the Alternatives section for other ideas that were discarded.

### Chosen Approach: Direct CLI Tool Exposure (Idea1)

The initial implementation takes a pragmatic approach by directly exposing the
existing CLI tools (`ovn-nbctl`, `ovn-sbctl`, `ovs-vsctl`, etc.) as MCP tools.
While this approach may seem less elegant than creating higher-level wrapper
abstractions to connect to the database (see discarded alternatives), it
offers the fastest path to value for OVN-Kubernetes engineers who are already
familiar with these tools but want to leverage LLM assistance for command
construction and output analysis. The CLI-based approach is also optimal for
exposing the complete troubleshooting toolkit across all layers of the stack.
Most other discarded ideas only addressed OVSDB access and reusability
concerns while failing to provide the holistic set of tools needed for
comprehensive root cause analysis, like running packet traces.

The MCP server acts as a secure execution bridge, translating natural language
troubleshooting requests into appropriate CLI commands.

**Advantages**:

* **Fastest Time to Value**: Leverages existing tools that engineers already
  know
* **Zero Deployment Overhead**: All required CLI binaries are already present
  in the pods and nodes within the cluster
* **Comprehensive Coverage**: Only approach that provides access to the
  complete troubleshooting toolkit across all stack layers
* **Security Controls**: Enables read-only access enforcement by exposing
  only the tools with read-only access (get/list)
* **No version compatibility issues** between the MCP server tools and the
  cluster's installed versions when running on live cluster environments

**Trade-offs**:

* **Limited Reusability**: Somewhat specific to OVN-Kubernetes deployment
  patterns and can't be reused in other layers like OVS

This approach was selected as the optimal balance between security,
functionality, and development effort.

## Security Model and RBAC Constraints

No matter how we approach this implementation, it is impossible to totally
secure the execution of networking troubleshooting tools at this stage. Most
networking tools require privileged access to function properly, creating
inherent security trade-offs. The goal is therefore to minimize blast radius
through layered controls rather than achieve perfect isolation.

**Kubernetes Layer Security:**

* `kubectl exec` and `kubectl debug node` operations require cluster-admin level
  privileges by design. Avoid exposing generic exec in the K8s layer.
  Prefer direct Kubernetes API reads (`get`, `list`, `logs`, `events`).
* Alternative approach using custom debug containers
  (`kubectl debug node --image=`) with volume mounts to database
  files reduces some attack surface but remains intrusive
* **Mitigation**: Expose only Kubernetes API read operations (`get`, `list`,
  `logs`, `events`); remove any generic exec tool in this layer.
* **Constraint**: MCP Server service account requires elevated privileges
  (including `pods/exec`) despite being conceptually a "troubleshooter" role

At first, for the Kubernetes layer, we thought of leveraging the open source
[kubernetes-mcp-server](https://github.com/containers/kubernetes-mcp-server),
so that whatever security posture it uses could be adopted. It has a
`read-only` mode and a mode where writes can be done via kubectl exec.
Later, after review feedback, this enhancement changed direction and
pivoted towards a more secure approach: adding tools that are wrappers on
top of `kubectl exec` without directly exposing it. So instead of relying
on `kubernetes-mcp-server`, our `ovn-kubernetes-mcp` will use
`kubectl exec` only as an implementation detail and not expose it directly.
The downside is that the get/list resource commands will be duplicated into
`ovn-kubernetes-mcp`, so we will also need to implement the Kubernetes-layer
tools we need ourselves.

**OVN/OVS Database Layer Security:**

* Unix socket-based database access prevents using SSL certificate-based
  authentication and authorization
* Database connections inherit the security context of the container executing
  the commands
* **Mitigation**: Command parameter validation to ensure only read-only
  database operations (`show`, `list`, `dump`) while blocking modification
  commands (`set`, `add`, `remove`)
* **Long-term Path**: Requires RBAC-enabled CLI execution even against the
  local unix socket.

**Host/Kernel Layer Security:**

* Kernel-level networking tools (`ip`, `nft`, `conntrack`) inherently require
  root system access
* Current tooling lacks granular RBAC capabilities - tools are typically
  all-or-nothing from a privilege perspective
* **External Tools Note**: Tools like `tcpdump -i any` can be highly intrusive
  as they capture all network traffic on the host, requiring careful
  consideration of privacy and performance impact when they are chosen for
  execution.
* **Short-term Mitigation**: Strict command allowlisting exposing only read
  operations (`ip route show`, `nft list ruleset`) while blocking modification
  commands (`ip route add`, `nft add rule`)
* **Long-term Path**: Requires RBAC-enabled wrapper tools from upstream
  layered community teams (for example netfilter, kernel networking)

## Distributed Execution Context

The MCP Server will only be supported in interconnect mode.

### **OVN-Kubernetes Architecture Challenge**

In the OVN-Kubernetes interconnect architecture, each node maintains its own
local instances of critical databases and services:

* **Northbound Database**: Local OVN northbound database per node
* **Southbound Database**: Local OVN southbound database per node
* **OpenVSwitch Database**: Node-specific OVS database and flow tables
* **Host Networking**: Node-specific routing tables, conntrack zones, and
  kernel state

This distributed architecture means that troubleshooting commands must be
executed on the specific node where the relevant data resides.
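Regardless of which node a command ultimately runs on, every request should
first pass the read-only validation described in the security model above. The
following is a minimal, hypothetical sketch of that allowlisting in Go; the
function, map, and exact allowlists are illustrative and not the final
`ovn-kubernetes-mcp` code, but the binaries and verbs mirror the read-only
operations listed in the mitigations.

```go
package mcptools

import (
	"fmt"
	"strings"
)

// allowedReadOnly maps each exposed binary to the argument prefixes the server
// is willing to execute. Anything that does not match a prefix is rejected
// before it ever reaches kubectl exec / kubectl debug. (Hypothetical sketch.)
var allowedReadOnly = map[string][][]string{
	"ovn-nbctl": {{"show"}, {"list"}, {"find"}},
	"ovn-sbctl": {{"show"}, {"list"}, {"dump-flows"}},
	"ovs-vsctl": {{"show"}, {"list"}, {"list-br"}},
	"nft":       {{"list", "ruleset"}},
	"ip":        {{"route", "show"}, {"addr", "show"}, {"rule", "show"}},
	"conntrack": {{"-L"}},
}

// validateReadOnly returns an error unless args start with one of the allowed
// prefixes for the binary, so mutating verbs such as "ovn-nbctl set" or
// "ip route add" are blocked by construction.
func validateReadOnly(binary string, args []string) error {
	prefixes, ok := allowedReadOnly[binary]
	if !ok {
		return fmt.Errorf("binary %q is not exposed by the MCP server", binary)
	}
	for _, prefix := range prefixes {
		if len(args) >= len(prefix) && matchesPrefix(args, prefix) {
			return nil
		}
	}
	return fmt.Errorf("%s %s is not an allowed read-only command", binary, strings.Join(args, " "))
}

// matchesPrefix reports whether args begins with every element of prefix.
func matchesPrefix(args, prefix []string) bool {
	for i := range prefix {
		if !strings.EqualFold(args[i], prefix[i]) {
			return false
		}
	}
	return true
}
```

With a check like this in front of the execution path, commands such as
`ovn-nbctl set` or `nft add rule` fail validation and are never executed,
which is the property the mitigations above aim for.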
+
### **Node-Targeted Command Execution**

**Node Selection Strategy**: All tools requiring node-specific data accept a
`node_name` parameter as a required argument. The MCP server uses this
parameter to:

1. **Pod Selection**: Locate the appropriate `ovnkube-node` pod running on the
   specified node using `pod.spec.nodeName` matching
2. **Container Targeting**: Route OVN database commands to the correct
   container within the ovnkube-node pod (e.g., `nb-ovsdb`, `sb-ovsdb`
   containers)
3. **Execution Context**: Execute host-level commands via
   `kubectl debug node/ --image <>` for direct host access

**LLM Responsibility**: The LLM must determine the appropriate target node(s)
based on the troubleshooting context:

* **Pod-specific issues**: Use the node where the problematic pod is scheduled
  (`kubectl get pod -o wide`)
* **Network flow analysis**: Target nodes along the packet path (source node,
  destination node, gateway nodes)
* **Cluster-wide analysis**: Potentially execute commands across multiple
  nodes for correlation

We need to account for testing this LLM responsibility: it is not something
we can guarantee, but something we are offloading to the LLM rather than
solving ourselves.

## Deployment Strategy

### **Flexible Deployment Modes**

The MCP server is designed for flexible deployment without requiring elaborate
cluster infrastructure. Multiple deployment modes support different use cases
and security requirements:

**CLI Tool Mode (Simplest)**:

* Run the MCP server binary directly on a machine with `kubectl` access
  to the target cluster
* Server uses existing cluster credentials and executes commands via standard
  CLI tools
* Suitable for both live cluster troubleshooting and offline analysis
* Offline, bundle-based troubleshooting analysis would still need the
  corresponding parser tools to be run locally where the debug artifact files
  are hosted
* No cluster deployment required - operates entirely through external API
  access

**Debug Container Mode**:

* Package the MCP server as a container image that the LLM can select for
  `kubectl debug node --image=` operations
* This custom debug image contains the MCP server binary along with all
  necessary troubleshooting tools
* Reduces blast radius compared to using default debug images with full host
  access
* The LLM chooses this image when it needs to execute commands requiring
  direct host access

**Future Considerations**:

* More elaborate deployment patterns (DaemonSet, Deployment) can be considered
  when we think of use cases beyond ovn-kubernetes developers

## Testing Strategy

**Unit Testing - MCP Server Tools**:

* Straightforward validation of individual tool execution and parameter
  handling. Use [mcp-inspector](https://github.com/modelcontextprotocol/inspector).
* Mock cluster responses to test command routing and error handling
* Verify security controls and command allowlisting functionality

**Integration Testing - The Complex Challenge**:

* **Real Failure Scenario Reproduction**: Design test scenarios based on past
  bugs and commonly occurring incidents
* **Chaos Engineering Integration**: Implement controlled failure injection to
  create realistic troubleshooting scenarios
* **LLM Reasoning Validation**: The most critical and challenging aspect -
  verifying that the LLM can produce meaningful root cause analysis from tool
  outputs

### **Scenario-Based Test Design**

**Historical Incident Replay**:

* Collect must-gather and sos-report bundles from past bugs
* Use these as offline test datasets to validate LLM troubleshooting accuracy
* Build a regression test suite ensuring consistent analysis quality over time

**Synthetic Failure Scenarios**:

Some examples include:

* Network policy and EgressIP misconfigurations
* Pod connectivity failures across nodes
* Gateway flow issues and routing problems
* OVN database inconsistencies, especially around EgressIPs

**LLM Capability Assessment**:

* Measure accuracy of root cause identification - depends on how much
  OVN-Kubernetes feature-specific context it has
* Evaluate quality of troubleshooting step recommendations
* Test correlation of multi-layer data analysis
* Validate handling of incomplete or missing data scenarios

### **Success Metrics**

* **Accuracy**: Percentage of correct root cause identifications in known
  failure scenarios
* **Completeness**: Coverage of troubleshooting steps recommended vs. manual
  expert analysis
* **Efficiency**: Time reduction compared to manual troubleshooting workflows
* **Safety**: Verification that only read-only operations are executed as
  intended

## Documentation Details

The OKEP for the MCP Server will be on
[https://ovn-kubernetes.io/](https://ovn-kubernetes.io/). All end-user
documentation will be in the new repo's docs folder.

* **Getting Started Guide**: MCP client setup and initial configuration
* **Troubleshooting Scenarios**: Common use cases and example natural language
  queries
* **Tool Reference**: Available tools and their capabilities across all stack
  layers
* **Security Model**: Warnings about security considerations
* **Deployment Options**: CLI mode vs. debug container mode setup instructions
* **Offline Analysis**: Must-gather and sos-report analysis workflows

## Alternative Implementation Ideas

### Idea0: Using Existing kubernetes-mcp-server with Generic Bash Execution

**Approach**: Use the existing kubernetes-mcp-server's `pods_exec` tool to run
arbitrary bash commands like `ovn-nbctl show` or `ip route` directly through
`kubectl exec` or `kubectl debug` sessions, without building any specialized
tooling.

**Rationale**: This approach would provide immediate access to all CLI tools
without any development effort, leveraging the LLM's knowledge of command
syntax to construct appropriate bash commands.

**Why Discarded**:

* **Security Risk**: Allowing arbitrary bash command execution creates
  significant security vulnerabilities. There's no protection against
  destructive commands like `ovn-nbctl set` operations that could modify live
  database state.
* **Lack of Access Control**: No way to enforce read-only operations or
  validate command parameters before execution.
* **Separation of Concerns**: The LLM should focus on analysis and
  troubleshooting logic, not on understanding the security implications of
  direct system access.
* **Blast Radius**: Any compromise or LLM hallucination could potentially
  execute dangerous commands on production systems.

The fundamental principle that "each layer knows best about how/what tools to
allow with proper read access" makes a controlled wrapper approach essential
rather than direct bash execution.

### Idea1: Chosen Approach: Direct CLI Tool Exposure

See the Proposed Solution section.

### Idea2: libovsdb-client Golang Wrapper

**Approach**: Build a Golang MCP server using `NewOVSDBClient` to directly
query OVSDB instances with proper RBAC controls implemented through OVSDB's
native access control mechanisms.

**Advantages**:

* **High Reusability**: Could be shared across OVN, OVS, and OVN-Kubernetes
  projects
* **Native RBAC**: Leverages OVSDB's built-in role-based access controls
* **Structured Output**: Returns structured data rather than CLI text parsing

**Why Discarded**:

* **Deployment Model**: Would require running as a DaemonSet on each node or
  shipping binaries to ovnkube-node pods
* **Scope Limitation**: Only addresses database access, missing OVN and OVS
  flow trace simulation and host networking tools

### Idea3: ovsdb-client Binary Wrapper

**Approach**: Create a wrapper around the existing `ovsdb-client` binary
(owned by the OVS team) to provide structured database access.

**Advantages**:

* **High Reusability**: Could be shared across OVN, OVS, and OVN-Kubernetes
  projects
* **Native RBAC**: Leverages OVSDB's built-in role-based access controls

**Why Discarded**:

* **Ownership**: We could argue this wrapper better belongs in the openvswitch
  community than in the OVN-Kubernetes org
* **Scope Limitation**: Only addresses database access, missing OVN and OVS
  flow trace simulation and host networking tools
* In the future, once we reach out to the OVN and OVS communities to see what
  they plan to do, we could revisit it.

### Idea4: Direct Database Access Wrapper

**Approach**: Build a wrapper for direct database read operations bypassing CLI
and client tools entirely.

**Advantages**:

* **High Reusability**: Could be shared across OVN, OVS, and OVN-Kubernetes
  projects

**Why Discarded**: Building from scratch when there's no real need to -
existing CLI tools already provide all necessary functionality with proven
reliability.

# Known Risks and Limitations

* AI! We can trust it only as far as we can throw it.
  * Quality of this troubleshooter depends on the LLM's intelligence
  * Quality of the MCP Server itself is however in our own hands and can be
    enhanced based on user experience
* Security! We know that we cannot fully eliminate the risk
* Performance/Scalability: MCPs are a relatively new concept. So aspects like
  how many tools we can expose per server and up to what point it scales,
  etc., are unknowns. We will need to try and test it in our PoCs as we
  develop this.
  * By sending bulky logs and debug details we also risk running out of
    context window. We need to ensure we filter the logs down to what is
    relevant.
  * Token consumption and context window bloating.
  * Other potential problems we need to rule out during testing:
    * Poor tool selection: LLMs struggle to choose the right tool from too many options
    * Parameter hallucination: Agents invoke tools with incorrect or fabricated parameters
    * Misinterpretation: Responses from tools are more likely to be misunderstood
    * Attention spreading: The model's attention gets distributed thinly across many options
diff --git a/docs/okeps/okep-5552-dynamic-udn-node-allocation.md b/docs/okeps/okep-5552-dynamic-udn-node-allocation.md
new file mode 100644
index 0000000000..1a68b9efb6
--- /dev/null
+++ b/docs/okeps/okep-5552-dynamic-udn-node-allocation.md
@@ -0,0 +1,205 @@
+# OKEP-5552: Dynamic UDN Node Allocation

* Issue: [#5552](https://github.com/ovn-org/ovn-kubernetes/issues/5552)

## Problem Statement

When scaling UDNs, the control-plane cost of rendering a topology is high. This is the core limiting factor to
being able to scale to 1000s of UDNs. While there are plans to also improve network controller performance with UDNs,
there are still valuable savings to be had by not rendering UDNs on nodes where they are not needed.

An example use case where this makes sense is when a Kubernetes cluster has its node resources segmented per tenant. In
this case, it only makes sense to run the tenant network (UDN) on the nodes where a tenant is allowed to run pods. This
allows for horizontal scaling to a much higher number of overall UDNs running in a cluster.

## Goals

 * To dynamically allow the network to only be rendered on specific nodes.
 * To increase the overall scalability of the number of UDNs in a Kubernetes cluster with this solution.
 * To increase the efficiency of ovnkube operations on nodes where a UDN exists, but is not needed.

## Non-Goals

 * To fully solve control plane performance issues with UDNs. There will be several other fixes done to address that
   outside of this enhancement.
 * To provide any type of network security guarantee about exposing UDNs to a limited subset of nodes.

## Future Goals

 * Potentially enabling this feature on a per-UDN basis, rather than globally.

## Introduction

The purpose of this feature is to add a configuration knob that users can turn on so that UDNs are only rendered on nodes
where pods exist on that UDN. This feature will allow for higher overall UDN scale and less per-node control plane resource usage
under conditions where clusters do not have pods on every node with connections to all UDNs. For example, with
1000 UDNs and 500 nodes, if a particular node only has pods connected to, say, 200 of those UDNs, then that node is only
responsible for rendering 200 UDNs instead of 1000 UDNs as it is today.

This can provide significant control plane savings, but comes at a cost. Using the previous example, if a pod is now
launched in UDN 201, the node will have to render UDN 201 before the pod can be wired. In other words, this introduces
a one-time larger pod latency cost for the first pod wired to the UDN. Additionally, there are more tradeoffs with other
feature limitations outlined later in this document.

## User-Stories/Use-Cases

Story 1: Segment groups of nodes per tenant

As a cluster admin, I plan to dedicate groups of nodes to either a single tenant or a small group of tenants. I plan
to create a CUDN per tenant, which means my network will only really need to exist on this group of nodes. I would
like to be able to limit this network to only be rendered on that subset of nodes.
This way I will be able to have less resource overhead from OVN-Kubernetes on each node,
and be able to scale to a higher number of UDNs in my cluster.

## Proposed Solution

The proposed solution is to add a configuration knob to OVN-Kubernetes, "--dynamic-udn-allocation", which will enable
this feature. Once enabled, NADs derived from CUDNs and UDNs will only be rendered on nodes where there is a pod
scheduled in that respective network. Additionally, if the node is scheduled as an Egress IP node for a UDN, this node
will also render the UDN.

When the last pod on the network is deleted from a node, OVNK will not immediately tear down the UDN.
Instead, OVNK will rely on a dead timer to expire to conclude that this UDN is no longer in use and
may be removed. This timer will also be configurable in OVN-Kubernetes as "--udn-deletion-grace-period".

### API Details

There will be no API changes. There will be new status conditions introduced in the section below.

### Implementation Details

In OVN-Kubernetes we have three main controllers that handle rendering of networking features for UDNs. They exist as:
 - Cluster Manager - runs on the control plane, handles cluster-wide allocation and rendering of CUDNs/UDNs
 - Controller Manager - runs on a per-zone basis, handles configuring OVN for all networking features
 - Node Controller Manager - runs on a per-node basis, handles configuring node-specific things like nftables, VRFs, etc.

With this change, Cluster Manager will be largely untouched, while Controller Manager and Node Controller Manager will be
modified in a few places to filter out rendering UDNs when no pod for them exists on the node.

#### Internal Controller Details

In OVN-Kubernetes we have many controllers that handle features for different networks, encompassed under three
controller manager containers. The breakdown of how these will be modified is outlined below:

* Cluster Manager
  * UDN Controller — No change
  * Route Advertisements Controller — No change
  * Egress Service Cluster — Doesn't support UDN
  * Endpoint Mirror Controller — No change
  * EgressIP Controller — No change
  * Unidling Controller — No change
  * DNS Resolver — No change
  * Network Cluster Controller — Modified to report status and exclude nodes not serving the UDN
* Controller Manager (ovnkube-controller)
  * Default Network — No change
  * NAD Controller — Ignore NADs for UDNs that are not active on this node (no pods for the UDN and not an EIP node)
* Node Controller Manager
  * Default Network — No change
  * NAD Controller — Ignore NADs for UDNs that are not active on this node (no pods for the UDN and not an EIP node)

The resulting NAD Controller change will filter out NADs that do not apply to this node, stopping NAD keys from being
enqueued to the Controller Manager/Node Controller Manager's Network Manager. Those Controller Managers will not need
to create or run any sub-controllers for networks that are not present on the node. To do this cleanly, NAD Controller will be
modified to hold a filterFunc field, which the respective controller manager can set in order to filter out NADs. For
Cluster Manager, this function will not apply, but for Controller Manager and Node Controller Manager it will be a function
that filters based on whether the UDN is serving pods on this node.
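As a rough illustration of this hook, the sketch below shows what the
filterFunc plumbing could look like. The type names, the `NodeHasNAD` query,
and the import path for the NAD types (the commonly vendored
network-attachment-definition-client package) are assumptions for the sake of
the example, not the final implementation.

```go
package nadcontroller

import (
	nadv1 "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/apis/k8s.cni.cncf.io/v1"
)

// FilterFunc decides whether a NAD should be rendered by this manager.
// A nil filter (the Cluster Manager case) means every NAD is rendered.
type FilterFunc func(nad *nadv1.NetworkAttachmentDefinition) bool

// NADTracker is the query surface the Pod/EgressIP trackers would expose
// (see the next section); the real interface may differ.
type NADTracker interface {
	// NodeHasNAD reports whether the given node currently has a pod on the
	// NAD's network or is acting as an EgressIP node for it.
	NodeHasNAD(nodeName, nadKey string) bool
}

// NewNodeLocalFilter is what Controller Manager and Node Controller Manager
// would install on the NAD controller so inactive UDNs are skipped locally.
func NewNodeLocalFilter(tracker NADTracker, nodeName string) FilterFunc {
	return func(nad *nadv1.NetworkAttachmentDefinition) bool {
		return tracker.NodeHasNAD(nodeName, nad.Namespace+"/"+nad.Name)
	}
}

// shouldEnqueue shows where the filter would be consulted before a NAD key is
// handed to Network Manager: filtered-out NADs never produce a sub-controller.
func shouldEnqueue(filter FilterFunc, nad *nadv1.NetworkAttachmentDefinition) bool {
	return filter == nil || filter(nad)
}
```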
+ +#### New Pod/EgressIP Tracker Controller + +In order to know whether the Managers should filter out a UDN, a pod controller and egress IP controller will be used +in the Managers to track this information in memory. The pod controller will be a new level driven controller for +each manager. For Egress IP, another new controller will be introduced that watches EgressIPs, Namespaces, and NADs in +order to track which NAD maps to a node serving Egress IP. + +When Managers are created, they will start these Pod/EgressIP Tracker Controllers, and set a filterFunc on NAD Controller. +The filterFunc will query the aforementioned controllers to determine if the NAD being synced matches the local node. If +not, then NADController will not create the UDN controller for that network. + +Additionally, the Pod/EgressIP Tracker Controllers will expose a callback function, called "onNetworkRefChange". When +the first pod is detected as coming up on a node + NAD combination, or the node activates as an Egress IP node for the +first time, onNetworkRefChange will be triggered, which allows a callback mechanism to be leveraged for events. The +Controller Manager and Node Controller Manager will leverage this callback, so that they can trigger NAD Controller to +reconcile the NAD for these events. This is important as it provides a way to signal that NADController should remove +a UDN controller if it is no longer active, or alternatively, force the NAD Controller to reconcile a UDN Controller if for example, +a new remote node has activated. + +#### Other Controller Changes + +The Layer3 network controller will need to filter out nodes where the UDN is not rendered. Upon receiving events, +they will query a Manager function called NodeHasNAD. Managers will export a Tracker interface, that only contains this +method for UDN Controllers to query. The implementation of NodeHasNAD will rely on the Manager querying their pod and +egress IP trackers. + +Upon UDN activation of a remote node, these controllers will need to receive events in order to reconcile the new remote node. +To do this, the corresponding tracker will trigger its callback, "onNetworkRefChange". That will trigger the Manager +to ask NAD Controller to reconcile the UDN controller belonging to this NAD. Once that Layer 3 UDN controller reconciles, +it will walk nodes and determine what needs to be added or removed. It will take the applicable nodes, set their +syncZoneICFailed status, then immediately queue the objects to the retry framework with no backoff. This will allow +the Zone IC (ZIC) controller to properly configure the transit switch with the remote peers, or tear it down, if necessary. + +#### Status Condition and Metric Changes + +A new status condition will be added to CUDN/UDN that will indicate how many nodes are selected for a network: +```yaml +status: + conditions: + - type: NodesSelected + status: "True" + reason: DynamicAllocation + message: "5 nodes rendered with network" + lastTransitionTime: 2025-09-22T20:10:00Z +``` + +If the status is "False", then no nodes are currently allocated for the network - no pods or egress IPs assigned. + +Cluster Manager will leverage instances of EgressIP and Pod Trackers in order to use that data for updating this status. +The nodes serving a network are defined as a node with at least one OVN networked pod or having an Egress IP assigned to +it on a NAD that maps to a UDN or CUDN. 
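To make the status handling above concrete, a hedged sketch of how Cluster
Manager could derive the `NodesSelected` condition from the tracker data is
shown below. The helper name and its wiring are hypothetical; only the
condition fields follow the YAML example above.

```go
package status

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// nodesSelectedCondition builds the NodesSelected condition for a CUDN/UDN
// from the number of nodes the Pod/EgressIP trackers report as serving the
// network. Real code would preserve LastTransitionTime when the status value
// does not change; this sketch always stamps the current time.
func nodesSelectedCondition(activeNodes int) metav1.Condition {
	cond := metav1.Condition{
		Type:               "NodesSelected",
		Status:             metav1.ConditionFalse,
		Reason:             "DynamicAllocation",
		Message:            "no nodes rendered with network",
		LastTransitionTime: metav1.Now(),
	}
	if activeNodes > 0 {
		cond.Status = metav1.ConditionTrue
		cond.Message = fmt.Sprintf("%d nodes rendered with network", activeNodes)
	}
	return cond
}
```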
+
Additionally, events will be posted to the corresponding UDN/CUDN when nodes become active or inactive for
the network. This was chosen instead of doing per-node status entries, as that can lead to scale issues. Using events provides
the audit trail, without those scale implications. The one drawback of this approach pertains to UDN deactivation. There
is a "udn-deletion-grace-period" timer used to delay deactivation of a UDN on a node. This is to prevent churn if a pod
is deleted, then almost immediately re-added. Without storing the timestamp in the API, we are relying internally on in-memory
data. While this is fine for normal operation, if the OVN-Kube pod restarts, we lose that context. However, this should
be fine as when we restart we have to walk and start all network controllers anyway, so we are not really creating a lot of
extra work for OVN-Kube here.

A metric will also be exposed which allows the user to track over time how many nodes were active for a particular
network.

### Testing Details

* Unit Tests will be added to ensure the behavior works as expected, including checking that
OVN switches/routers are not created when there is no pod/egress IP active on the node, etc.
* E2E Tests will be added to create a CUDN/UDN with the feature enabled and ensure pod traffic works correctly between nodes.
* Benchmark/Scale testing will be done to show the resource savings of 1000s of nodes with 1000s of UDNs.

### Documentation Details

* User-Defined Network feature documentation will be updated with a user guide for this new feature.

## Risks, Known Limitations and Mitigations

Risks:
 * Additional first-pod cold start latency per UDN/node. Could impact pod readiness SLOs.
 * Burst reconcile load on large rollouts of pods on inactive nodes.

Limitations:
 * No OVN central support.
 * NodePort/ExternalIP services with external traffic policy mode "cluster" will not work when sending traffic to inactive nodes.
 * MetalLB must be configured on nodes where the UDN is rendered. This can be achieved by scheduling a daemonset for the designated nodes on the UDN.

## OVN Kubernetes Version Skew

Targeted for release 1.2.

## Alternatives

Specifying a NodeSelector in the CUDN/UDN CRD in order to determine where a network should be rendered. This was the
initial idea of this enhancement, but was evaluated as less desirable than dynamic allocation. The dynamic allocation
provides more flexibility without a user/admin needing to intervene and update a CRD.

## References

None
diff --git a/go-controller/Makefile b/go-controller/Makefile
index 27ebf94c8b..8224451f9e 100644
--- a/go-controller/Makefile
+++ b/go-controller/Makefile
@@ -102,7 +102,8 @@ ifeq ($(CONTAINER_RUNNABLE), 0)
 	@GOPATH=${GOPATH} ./hack/lint.sh $(CONTAINER_RUNTIME) fix || { echo "ERROR: lint fix failed! There is a bug that changes file ownership to root \
 	when this happens. To fix it, simply run 'chown -R : *' from the repo root."; exit 1; }
 else
-	echo "linter can only be run within a container since it needs a specific golangci-lint version"; exit 1
+	echo "no container runtime. 
attempting to run natively";
+	@GOPATH=${GOPATH} ./hack/lint.sh run-natively || { echo "running lint locally failed!"; exit 1; }
 endif
 
 gofmt:
diff --git a/go-controller/hack/lint.sh b/go-controller/hack/lint.sh
index 92863e6b5c..09b47c3911 100755
--- a/go-controller/hack/lint.sh
+++ b/go-controller/hack/lint.sh
@@ -1,18 +1,28 @@
 #!/usr/bin/env bash
 VERSION=v1.64.8
-extra_flags=""
+: "${GOLANGCI_LINT_VERSION:=$VERSION}"
+extra_flags=(--verbose --print-resources-usage --modules-download-mode=vendor --timeout=15m0s)
 if [ "$#" -ne 1 ]; then
     if [ "$#" -eq 2 ] && [ "$2" == "fix" ]; then
-        extra_flags="--fix"
+        extra_flags+=(--fix)
     else
-        echo "Expected command line argument - container runtime (docker/podman) got $# arguments: $@"
+        echo "Expected command line argument - container runtime (docker/podman) or 'run-natively'; got $# arguments: $*"
         exit 1
     fi
 fi
-$1 run --security-opt label=disable --rm \
-    -v ${HOME}/.cache/golangci-lint:/cache -e GOLANGCI_LINT_CACHE=/cache \
-    -v $(pwd):/app -w /app -e GO111MODULE=on docker.io/golangci/golangci-lint:${VERSION} \
-    golangci-lint run --verbose --print-resources-usage \
-    --modules-download-mode=vendor --timeout=15m0s ${extra_flags} && \
-    echo "lint OK!"
\ No newline at end of file
+if [ "$1" = "run-natively" ]; then
+    mkdir -p /tmp/local/bin/
+    curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b /tmp/local/bin/ "${GOLANGCI_LINT_VERSION}"
+    mkdir -p /tmp/golangci-cache
+    export GOLANGCI_LINT_CACHE=/tmp/golangci-cache
+    /tmp/local/bin/golangci-lint run "${extra_flags[@]}" && \
+    echo "lint OK!"
+else
+    $1 run --security-opt label=disable --rm \
+        -v "${HOME}"/.cache/golangci-lint:/cache -e GOLANGCI_LINT_CACHE=/cache \
+        -v "$(pwd)":/app -w /app -e GO111MODULE=on docker.io/golangci/golangci-lint:"${VERSION}" \
+        golangci-lint run "${extra_flags[@]}" && \
+        echo "lint OK!"
+fi
+
diff --git a/go-controller/pkg/clustermanager/egressip_controller.go b/go-controller/pkg/clustermanager/egressip_controller.go
index 4cbd00d18f..70ec7a58e2 100644
--- a/go-controller/pkg/clustermanager/egressip_controller.go
+++ b/go-controller/pkg/clustermanager/egressip_controller.go
@@ -129,6 +129,15 @@ func (eIPC *egressIPClusterController) getAllocationTotalCount() float64 {
 	return float64(count)
 }
 
+func (e *egressNode) hasAllocatedEgressIP(name string, eip string) bool {
+	for ip, egressIPName := range e.allocations {
+		if egressIPName == name && ip == eip {
+			return true
+		}
+	}
+	return false
+}
+
 // nodeAllocator contains all the information required to manage EgressIP assignment to egress node. This includes assignment
 // of EgressIP IPs to nodes and ensuring the egress nodes are reachable. For cloud nodes, it also tracks limits for
 // IP assignment to each node. 
@@ -865,6 +874,7 @@ func (eIPC *egressIPClusterController) addAllocatorEgressIPAssignments(name stri defer eIPC.nodeAllocator.Unlock() for _, status := range statusAssignments { if eNode, exists := eIPC.nodeAllocator.cache[status.Node]; exists { + klog.V(5).Infof("Setting egress IP node allocation - node: %s, EIP name: %s, IP: %s", eNode.name, name, status.EgressIP) eNode.allocations[status.EgressIP] = name } } @@ -1423,6 +1433,10 @@ func (eIPC *egressIPClusterController) validateEgressIPStatus(name string, items klog.Errorf("Allocator error: EgressIP: %s claims multiple egress IPs on same node: %s, will attempt rebalancing", name, eIPStatus.Node) validAssignment = false } + if !eNode.hasAllocatedEgressIP(name, eIPStatus.EgressIP) { + klog.Errorf("Allocator error: EgressIP: %s has mistmach with status vs cache for node: %s with IP: %s", name, eIPStatus.Node, eIPStatus.EgressIP) + validAssignment = false + } if !eNode.isEgressAssignable { klog.Errorf("Allocator error: EgressIP: %s assigned to node: %s which does not have egress label, will attempt rebalancing", name, eIPStatus.Node) validAssignment = false diff --git a/go-controller/pkg/clustermanager/egressip_controller_test.go b/go-controller/pkg/clustermanager/egressip_controller_test.go index 7f47c2d25d..ed64e59740 100644 --- a/go-controller/pkg/clustermanager/egressip_controller_test.go +++ b/go-controller/pkg/clustermanager/egressip_controller_test.go @@ -3,8 +3,12 @@ package clustermanager import ( "context" "fmt" + "maps" "net" + "reflect" + "slices" "strconv" + "strings" "sync" "time" @@ -296,6 +300,62 @@ var _ = ginkgo.Describe("OVN cluster-manager EgressIP Operations", func() { return nodes[0] } + egressIPsMatch := func(expectedEgressIPs []egressipv1.EgressIP) func() bool { + return func() (result bool) { + egressIPList, err := fakeClusterManagerOVN.fakeClient.EgressIPClient.K8sV1().EgressIPs().List(context.TODO(), metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gotEgressIPs := egressIPList.Items + + defer func() { + if !result { + ginkgo.GinkgoWriter.Printf("Mismatch in egressIP lists, got: %+v, expected: %+v\n", gotEgressIPs, expectedEgressIPs) + } + }() + + if len(expectedEgressIPs) != len(gotEgressIPs) { + return false + } + + numEquals := 0 + for _, expectedEgressIP := range expectedEgressIPs { + for _, gotEgressIP := range gotEgressIPs { + if expectedEgressIP.Name != gotEgressIP.Name { + continue + } + + specExpectedEIP := slices.Clone(expectedEgressIP.Spec.EgressIPs) + specGotEgressIP := slices.Clone(gotEgressIP.Spec.EgressIPs) + slices.Sort(specExpectedEIP) + slices.Sort(specGotEgressIP) + if !reflect.DeepEqual(specExpectedEIP, specGotEgressIP) { + return false + } + + statusExpectedEgressIP := slices.Clone(expectedEgressIP.Status.Items) + statusGotEgressIP := slices.Clone(gotEgressIP.Status.Items) + sortFunc := func(a, b egressipv1.EgressIPStatusItem) int { + return strings.Compare(a.EgressIP, b.EgressIP) + } + slices.SortFunc(statusExpectedEgressIP, sortFunc) + slices.SortFunc(statusGotEgressIP, sortFunc) + if !reflect.DeepEqual(statusExpectedEgressIP, statusGotEgressIP) { + return false + } + numEquals++ + } + } + return len(expectedEgressIPs) == numEquals + } + } + + readAllocations := func(nodeName string) map[string]string { + fakeClusterManagerOVN.eIPC.nodeAllocator.Lock() + defer fakeClusterManagerOVN.eIPC.nodeAllocator.Unlock() + node, ok := fakeClusterManagerOVN.eIPC.nodeAllocator.cache[nodeName] + gomega.Expect(ok).To(gomega.BeTrue(), fmt.Sprintf("node %s missing from allocator cache", 
nodeName)) + return maps.Clone(node.allocations) + } + ginkgo.BeforeEach(func() { // Restore global default values before each testcase gomega.Expect(config.PrepareTestConfig()).To(gomega.Succeed()) @@ -3264,15 +3324,21 @@ var _ = ginkgo.Describe("OVN cluster-manager EgressIP Operations", func() { ginkgo.Context("syncEgressIP for dual-stack", func() { - ginkgo.It("should not update valid assignments", func() { - // FIXME(mk): this test doesn't ensure that the EIP status does not get patched during the test run - // and therefore is an invalid test to test that the status is not patched + // This test validates that if the allocator cache contains valid entries that match + // the egress IP status items, no reassignment shall happen. + // In order to do so, it does 2 comparisons: + // a) take egressIP that's passed to the cluster manager and compare it to the list of EgressIPs after running + // WatchEgressIP. + // b) take egressNode.allocations before running WatchEgressIP and make sure that they haven't changed. + ginkgo.It("should not reallocate if cache matches EgressIP status", func() { app.Action = func(*cli.Context) error { config.IPv6Mode = true config.IPv4Mode = true egressIPv4 := "192.168.126.101" egressIPv6 := "0:0:0:0:0:feff:c0a8:8e0d" - node1IPv6 := "0:0:0:0:0:feff:c0a8:8e0c/64" + node1IPv6 := "0:0:0:0:0:feff:c0a8:8e0b/64" + node2IPv6 := "0:0:0:0:0:feff:c0a8:8e0c/64" + node1IPv4 := "192.168.126.50/16" node2IPv4 := "192.168.126.51/16" node1 := corev1.Node{ @@ -3318,8 +3384,15 @@ var _ = ginkgo.Describe("OVN cluster-manager EgressIP Operations", func() { }, } - egressNode1 := setupNode(node1Name, []string{node1IPv6}, map[string]string{}) - egressNode2 := setupNode(node2Name, []string{node2IPv4}, map[string]string{"192.168.126.102": "bogus3"}) + egressNode1 := setupNode(node1Name, []string{node1IPv4, node1IPv6}, map[string]string{ + net.ParseIP(egressIPv6).String(): egressIPName, // Cache matches eIP Status. + }) + egressNode2 := setupNode(node2Name, []string{node2IPv4, node2IPv6}, map[string]string{ + "192.168.126.102": "bogus3", + egressIPv4: egressIPName, // Cache matches eIP Status. 
+ }) + egressNode1OriginalAllocations := maps.Clone(egressNode1.allocations) + egressNode2OriginalAllocations := maps.Clone(egressNode2.allocations) eIP := egressipv1.EgressIP{ ObjectMeta: newEgressIPMeta(egressIPName), @@ -3350,11 +3423,11 @@ var _ = ginkgo.Describe("OVN cluster-manager EgressIP Operations", func() { _, err := fakeClusterManagerOVN.eIPC.WatchEgressIP() gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - gomega.Eventually(getEgressIPStatusLen(egressIPName)).Should(gomega.Equal(2)) - egressIPs, nodes := getEgressIPStatus(egressIPName) - gomega.Expect(nodes).To(gomega.ConsistOf(eIP.Status.Items[0].Node, eIP.Status.Items[1].Node)) - gomega.Expect(egressIPs).To(gomega.ConsistOf(eIP.Status.Items[0].EgressIP, eIP.Status.Items[1].EgressIP)) + gomega.Eventually(egressIPsMatch([]egressipv1.EgressIP{eIP})).Should(gomega.BeTrue()) + gomega.Eventually(readAllocations).WithArguments(egressNode1.name).Should( + gomega.Equal(egressNode1OriginalAllocations)) + gomega.Eventually(readAllocations).WithArguments(egressNode2.name).Should( + gomega.Equal(egressNode2OriginalAllocations)) return nil } @@ -3453,6 +3526,50 @@ var _ = ginkgo.Describe("OVN cluster-manager EgressIP Operations", func() { egressIPs, nodes := getEgressIPStatus(egressIPName) gomega.Expect(nodes).To(gomega.ConsistOf(egressNode1.name, egressNode2.name)) gomega.Expect(egressIPs).To(gomega.ConsistOf(eIP.Status.Items[0].EgressIP, eIP.Status.Items[1].EgressIP)) + // give some time for event handler to be added and finish processing initial updates + time.Sleep(3 * time.Second) + realEIP, err := fakeClusterManagerOVN.fakeClient.EgressIPClient.K8sV1().EgressIPs().Get(context.TODO(), egressIPName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Simulate lagging informer and send an event with outdated status") + eIP.Annotations = make(map[string]string) + for k, v := range realEIP.Annotations { + eIP.Annotations[k] = v + } + eIP.Status = egressipv1.EgressIPStatus{ + Items: []egressipv1.EgressIPStatusItem{ + { + EgressIP: egressIP2, + Node: egressNode1.name, + }, + { + EgressIP: egressIP1, + Node: egressNode1.name, + }, + }, + } + _, err = fakeClusterManagerOVN.fakeClient.EgressIPClient.K8sV1().EgressIPs().Update(context.TODO(), &eIP, metav1.UpdateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + // give sometime for events to be processed + time.Sleep(3 * time.Second) + ginkgo.By("Simulate lagging informer and send an event with real updated status") + _, err = fakeClusterManagerOVN.fakeClient.EgressIPClient.K8sV1().EgressIPs().Update(context.TODO(), realEIP, metav1.UpdateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + // give sometime for events to be processed + time.Sleep(3 * time.Second) + gomega.Eventually(func() error { + defer ginkgo.GinkgoRecover() + tmp, err := fakeClusterManagerOVN.fakeClient.EgressIPClient.K8sV1().EgressIPs().Get(context.TODO(), egressIPName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + var egressIPs, nodes []string + for _, status := range tmp.Status.Items { + egressIPs = append(egressIPs, status.EgressIP) + nodes = append(nodes, status.Node) + } + gomega.Expect(nodes).To(gomega.ConsistOf(egressNode1.name, egressNode2.name)) + gomega.Expect(egressIPs).To(gomega.ConsistOf(eIP.Status.Items[0].EgressIP, eIP.Status.Items[1].EgressIP)) + return nil + }).Should(gomega.Succeed()) + return nil } @@ -3956,7 +4073,13 @@ var _ = ginkgo.Describe("OVN cluster-manager EgressIP Operations", func() { 
gomega.Expect(err).NotTo(gomega.HaveOccurred()) }) - ginkgo.It("should not update valid assignment", func() { + // This test validates that if the allocator cache contains valid entries that match + // the egress IP status items, no reassignment shall happen. + // In order to do so, it does 2 comparisons: + // a) take egressIP that's passed to the cluster manager and compare it to the list of EgressIPs after running + // WatchEgressIP. + // b) take egressNode.allocations before running WatchEgressIP and make sure that they haven't changed. + ginkgo.It("should not reallocate if cache matches EgressIP status", func() { app.Action = func(*cli.Context) error { egressIP1 := "192.168.126.101" node1IPv4 := "192.168.126.12/24" @@ -4005,8 +4128,15 @@ var _ = ginkgo.Describe("OVN cluster-manager EgressIP Operations", func() { }, } - egressNode1 := setupNode(node1Name, []string{"192.168.126.12/24"}, map[string]string{"192.168.126.111": "bogus2"}) - egressNode2 := setupNode(node2Name, []string{"192.168.126.51/24"}, map[string]string{"192.168.126.68": "bogus3"}) + egressNode1 := setupNode(node1Name, []string{"192.168.126.12/24"}, map[string]string{ + "192.168.126.111": "bogus2", + egressIP1: egressIPName, // Cache matches eIP Status. + }) + egressNode2 := setupNode(node2Name, []string{"192.168.126.51/24"}, map[string]string{ + "192.168.126.68": "bogus3", + }) + egressNode1OriginalAllocations := maps.Clone(egressNode1.allocations) + egressNode2OriginalAllocations := maps.Clone(egressNode2.allocations) eIP := egressipv1.EgressIP{ ObjectMeta: newEgressIPMeta(egressIPName), @@ -4033,10 +4163,12 @@ var _ = ginkgo.Describe("OVN cluster-manager EgressIP Operations", func() { _, err := fakeClusterManagerOVN.eIPC.WatchEgressIP() gomega.Expect(err).NotTo(gomega.HaveOccurred()) - gomega.Eventually(getEgressIPStatusLen(egressIPName)).Should(gomega.Equal(1)) - egressIPs, nodes := getEgressIPStatus(egressIPName) - gomega.Expect(nodes[0]).To(gomega.Equal(egressNode1.name)) - gomega.Expect(egressIPs[0]).To(gomega.Equal(egressIP1)) + + gomega.Eventually(egressIPsMatch([]egressipv1.EgressIP{eIP})).Should(gomega.BeTrue()) + gomega.Eventually(readAllocations).WithArguments(egressNode1.name).Should( + gomega.Equal(egressNode1OriginalAllocations)) + gomega.Eventually(readAllocations).WithArguments(egressNode2.name).Should( + gomega.Equal(egressNode2OriginalAllocations)) return nil } diff --git a/go-controller/pkg/controllermanager/node_controller_manager.go b/go-controller/pkg/controllermanager/node_controller_manager.go index e183e3b3fc..34cfda589e 100644 --- a/go-controller/pkg/controllermanager/node_controller_manager.go +++ b/go-controller/pkg/controllermanager/node_controller_manager.go @@ -75,14 +75,18 @@ func (ncm *NodeControllerManager) CleanupStaleNetworks(validNetworks ...util.Net if !util.IsNetworkSegmentationSupportEnabled() { return nil } - validVRFDevices := make(sets.Set[string]) - for _, network := range validNetworks { - if !network.IsPrimaryNetwork() { - continue + // in DPU mode, vrfManager would be nil + if ncm.vrfManager != nil { + validVRFDevices := make(sets.Set[string]) + for _, network := range validNetworks { + if !network.IsPrimaryNetwork() { + continue + } + validVRFDevices.Insert(util.GetNetworkVRFName(network)) } - validVRFDevices.Insert(util.GetNetworkVRFName(network)) + return ncm.vrfManager.Repair(validVRFDevices) } - return ncm.vrfManager.Repair(validVRFDevices) + return nil } // newCommonNetworkControllerInfo creates and returns the base node network controller info @@ -126,7 +130,7 @@ 
func NewNodeControllerManager(ovnClient *util.OVNClientset, wf factory.NodeWatch return nil, err } } - if util.IsNetworkSegmentationSupportEnabled() { + if util.IsNetworkSegmentationSupportEnabled() && config.OvnKubeNode.Mode != ovntypes.NodeModeDPU { ncm.vrfManager = vrfmanager.NewController(ncm.routeManager) ncm.ruleManager = iprulemanager.NewController(config.IPv4Mode, config.IPv6Mode) } diff --git a/go-controller/pkg/crd/userdefinednetwork/v1/shared.go b/go-controller/pkg/crd/userdefinednetwork/v1/shared.go index 9787415ddc..ac1a4c95ca 100644 --- a/go-controller/pkg/crd/userdefinednetwork/v1/shared.go +++ b/go-controller/pkg/crd/userdefinednetwork/v1/shared.go @@ -102,6 +102,8 @@ type Layer3Subnet struct { // +kubebuilder:validation:XValidation:rule="!has(self.reservedSubnets) || self.reservedSubnets.all(e, self.subnets.exists(s, cidr(s).containsCIDR(cidr(e))))",message="reservedSubnets must be subnetworks of the networks specified in the subnets field",fieldPath=".reservedSubnets" // +kubebuilder:validation:XValidation:rule="!has(self.infrastructureSubnets) || self.infrastructureSubnets.all(e, self.subnets.exists(s, cidr(s).containsCIDR(cidr(e))))",message="infrastructureSubnets must be subnetworks of the networks specified in the subnets field",fieldPath=".infrastructureSubnets" // +kubebuilder:validation:XValidation:rule="!has(self.infrastructureSubnets) || !has(self.reservedSubnets) || self.infrastructureSubnets.all(infra, !self.reservedSubnets.exists(reserved, cidr(infra).containsCIDR(reserved) || cidr(reserved).containsCIDR(infra)))", message="infrastructureSubnets and reservedSubnets must not overlap" +// +kubebuilder:validation:XValidation:rule="!has(self.infrastructureSubnets) || self.infrastructureSubnets.all(s, isCIDR(s) && cidr(s) == cidr(s).masked())", message="infrastructureSubnets must be a masked network address (no host bits set)" +// +kubebuilder:validation:XValidation:rule="!has(self.reservedSubnets) || self.reservedSubnets.all(s, isCIDR(s) && cidr(s) == cidr(s).masked())", message="reservedSubnets must be a masked network address (no host bits set)" type Layer2Config struct { // Role describes the network role in the pod. 
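A note on the validation markers added above: the new `XValidation` rules on `Layer2Config` (mirrored in the generated CRD templates) require every `infrastructureSubnets` and `reservedSubnets` entry to be a masked network address, i.e. `isCIDR(s) && cidr(s) == cidr(s).masked()`, so values with host bits set are rejected at admission time. The snippet below is illustrative only and not part of this change; `isMaskedCIDR` is a made-up helper showing the equivalent check in Go, useful for sanity-checking a UDN/CUDN manifest before applying it.

```go
package main

import (
	"fmt"
	"net"
)

// isMaskedCIDR reports whether s is a CIDR whose address is the network
// address (no host bits set), mirroring the CEL rule
// `isCIDR(s) && cidr(s) == cidr(s).masked()`.
func isMaskedCIDR(s string) bool {
	ip, ipNet, err := net.ParseCIDR(s)
	if err != nil {
		return false
	}
	return ip.Equal(ipNet.IP)
}

func main() {
	// "10.100.0.0/16" and "2001:db8::/64" pass; "10.100.0.1/16" is rejected
	// because it has host bits set.
	for _, s := range []string{"10.100.0.0/16", "10.100.0.1/16", "2001:db8::/64"} {
		fmt.Printf("%-16s masked=%v\n", s, isMaskedCIDR(s))
	}
}
```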
// diff --git a/go-controller/pkg/node/bridgeconfig/bridgeflows.go b/go-controller/pkg/node/bridgeconfig/bridgeflows.go index 76bc6ef597..78cf8a2c42 100644 --- a/go-controller/pkg/node/bridgeconfig/bridgeflows.go +++ b/go-controller/pkg/node/bridgeconfig/bridgeflows.go @@ -411,9 +411,9 @@ func (b *BridgeConfiguration) flowsForDefaultBridge(extraIPs []net.IP) ([]string dftFlows = append(dftFlows, fmt.Sprintf("cookie=%s, priority=250, table=2, %s, pkt_mark=%s, "+ - "actions=set_field:%s->eth_dst,output:%s", + "actions=set_field:%s->eth_dst,%soutput:%s", nodetypes.DefaultOpenFlowCookie, protoPrefixV4, netConfig.PktMark, - bridgeMacAddress, netConfig.OfPortPatch)) + bridgeMacAddress, mod_vlan_id, netConfig.OfPortPatch)) } } @@ -448,9 +448,9 @@ func (b *BridgeConfiguration) flowsForDefaultBridge(extraIPs []net.IP) ([]string netConfig.V6MasqIPs.ManagementPort.IP.String())) dftFlows = append(dftFlows, fmt.Sprintf("cookie=%s, priority=250, table=2, %s, pkt_mark=%s, "+ - "actions=set_field:%s->eth_dst,output:%s", + "actions=set_field:%s->eth_dst,%soutput:%s", nodetypes.DefaultOpenFlowCookie, protoPrefixV6, netConfig.PktMark, - bridgeMacAddress, netConfig.OfPortPatch)) + bridgeMacAddress, mod_vlan_id, netConfig.OfPortPatch)) } } diff --git a/go-controller/pkg/node/controllers/egressip/egressip.go b/go-controller/pkg/node/controllers/egressip/egressip.go index 3c6b340bbf..dd1f15f3c9 100644 --- a/go-controller/pkg/node/controllers/egressip/egressip.go +++ b/go-controller/pkg/node/controllers/egressip/egressip.go @@ -45,6 +45,7 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/syncmap" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util/egressip" utilerrors "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util/errors" ) @@ -539,15 +540,15 @@ func (c *Controller) processEIP(eip *eipv1.EgressIP) (*eIPConfig, sets.Set[strin if isValid := isEIPStatusItemValid(status, c.nodeName); !isValid { continue } - eIPNet, err := util.GetIPNetFullMask(status.EgressIP) - if err != nil { + ip := net.ParseIP(status.EgressIP) + if ip == nil { return nil, selectedNamespaces, selectedPods, selectedNamespacesPodIPs, - fmt.Errorf("failed to generate mask for EgressIP %s IP %s: %v", eip.Name, status.EgressIP, err) + fmt.Errorf("failed to parse EgressIP %s IP %s", eip.Name, status.EgressIP) } - if util.IsOVNNetwork(parsedNodeEIPConfig, eIPNet.IP) { + if util.IsOVNNetwork(parsedNodeEIPConfig, ip) { continue } - found, link, err := findLinkOnSameNetworkAsIP(eIPNet.IP, c.v4, c.v6) + found, link, err := findLinkOnSameNetworkAsIP(ip, c.v4, c.v6) if err != nil { return nil, selectedNamespaces, selectedPods, selectedNamespacesPodIPs, fmt.Errorf("failed to find a network to host EgressIP %s IP %s: %v", eip.Name, status.EgressIP, err) @@ -560,7 +561,7 @@ func (c *Controller) processEIP(eip *eipv1.EgressIP) (*eIPConfig, sets.Set[strin if err != nil { return nil, selectedNamespaces, selectedPods, selectedNamespacesPodIPs, fmt.Errorf("failed to list namespaces: %w", err) } - isEIPV6 := utilnet.IsIPv6(eIPNet.IP) + isEIPV6 := utilnet.IsIPv6(ip) for _, namespace := range namespaces { netInfo, err := c.getActiveNetworkForNamespace(namespace.Name) if err != nil { @@ -593,13 +594,13 @@ func (c *Controller) processEIP(eip *eipv1.EgressIP) (*eIPConfig, sets.Set[strin if selectedNamespacesPodIPs[namespace.Name] == nil { selectedNamespacesPodIPs[namespace.Name] = 
make(map[ktypes.NamespacedName]*podIPConfigList) } - selectedNamespacesPodIPs[namespace.Name][podNamespaceName] = generatePodConfig(ips, link, eIPNet, isEIPV6) + selectedNamespacesPodIPs[namespace.Name][podNamespaceName] = generatePodConfig(ips, link, ip, isEIPV6) selectedPods.Insert(podNamespaceName) } } // ensure at least one pod is selected before generating config if len(selectedNamespacesPodIPs) > 0 { - eipSpecificConfig, err = generateEIPConfig(link, eIPNet, isEIPV6) + eipSpecificConfig, err = generateEIPConfig(link, ip, isEIPV6) if err != nil { return nil, selectedNamespaces, selectedPods, selectedNamespacesPodIPs, fmt.Errorf("failed to generate EIP configuration for EgressIP %s IP %s: %v", eip.Name, status.EgressIP, err) @@ -611,7 +612,7 @@ func (c *Controller) processEIP(eip *eipv1.EgressIP) (*eIPConfig, sets.Set[strin return eipSpecificConfig, selectedNamespaces, selectedPods, selectedNamespacesPodIPs, nil } -func generatePodConfig(podIPs []net.IP, link netlink.Link, eIPNet *net.IPNet, isEIPV6 bool) *podIPConfigList { +func generatePodConfig(podIPs []net.IP, link netlink.Link, eIP net.IP, isEIPV6 bool) *podIPConfigList { newPodIPConfigs := newPodIPConfigList() for _, podIP := range podIPs { isPodIPv6 := utilnet.IsIPv6(podIP) @@ -619,7 +620,7 @@ func generatePodConfig(podIPs []net.IP, link netlink.Link, eIPNet *net.IPNet, is continue } ipConfig := newPodIPConfig() - ipConfig.ipTableRule = generateIPTablesSNATRuleArg(podIP, isPodIPv6, link.Attrs().Name, eIPNet.IP.String()) + ipConfig.ipTableRule = generateIPTablesSNATRuleArg(podIP, isPodIPv6, link.Attrs().Name, eIP.String()) ipConfig.ipRule = generateIPRule(podIP, isPodIPv6, link.Attrs().Index) ipConfig.v6 = isPodIPv6 newPodIPConfigs.elems = append(newPodIPConfigs.elems, ipConfig) @@ -628,14 +629,14 @@ func generatePodConfig(podIPs []net.IP, link netlink.Link, eIPNet *net.IPNet, is } // generateEIPConfig generates configuration that isn't related to any pod EIPs to support config of a single EIP -func generateEIPConfig(link netlink.Link, eIPNet *net.IPNet, isEIPV6 bool) (*eIPConfig, error) { +func generateEIPConfig(link netlink.Link, eIP net.IP, isEIPV6 bool) (*eIPConfig, error) { eipConfig := newEIPConfig() linkRoutes, err := generateRoutesForLink(link, isEIPV6) if err != nil { return nil, err } eipConfig.routes = linkRoutes - eipConfig.addr = getNetlinkAddress(eIPNet, link.Attrs().Index) + eipConfig.addr = egressip.GetNetlinkAddress(eIP, link.Attrs().Index) return eipConfig, nil } @@ -1482,14 +1483,6 @@ func isLinkUp(flags string) bool { return strings.Contains(flags, "up") } -func getNetlinkAddress(addr *net.IPNet, ifindex int) *netlink.Addr { - return &netlink.Addr{ - IPNet: addr, - Scope: int(netlink.SCOPE_UNIVERSE), - LinkIndex: ifindex, - } -} - // generateIPRules generates IP rules at a predefined priority for each pod IP with a custom routing table based // from the links 'ifindex' func generateIPRule(srcIP net.IP, isIPv6 bool, ifIndex int) netlink.Rule { diff --git a/go-controller/pkg/node/default_node_network_controller.go b/go-controller/pkg/node/default_node_network_controller.go index 9e0d28b582..e3923fcc54 100644 --- a/go-controller/pkg/node/default_node_network_controller.go +++ b/go-controller/pkg/node/default_node_network_controller.go @@ -153,7 +153,7 @@ func newDefaultNodeNetworkController(cnnci *CommonNodeNetworkControllerInfo, sto routeManager: routeManager, ovsClient: ovsClient, } - if util.IsNetworkSegmentationSupportEnabled() { + if util.IsNetworkSegmentationSupportEnabled() && config.OvnKubeNode.Mode != 
types.NodeModeDPU { c.udnHostIsolationManager = NewUDNHostIsolationManager(config.IPv4Mode, config.IPv6Mode, cnnci.watchFactory.PodCoreInformer(), cnnci.name, cnnci.recorder) } @@ -936,6 +936,9 @@ func (nc *DefaultNodeNetworkController) Init(ctx context.Context) error { if err != nil { return err } + } + + if config.OvnKubeNode.Mode != types.NodeModeDPU { if nc.udnHostIsolationManager != nil { if err = nc.udnHostIsolationManager.Start(ctx); err != nil { return err @@ -980,9 +983,11 @@ func (nc *DefaultNodeNetworkController) Init(ctx context.Context) error { nodeAnnotator := kube.NewNodeAnnotator(nc.Kube, node.Name) - // Use the device from environment when the DP resource name is specified. - if err := configureMgmtPortNetdevFromResource(); err != nil { - return err + if config.OvnKubeNode.Mode != types.NodeModeDPU { + // Use the device from environment when the DP resource name is specified. + if err := configureMgmtPortNetdevFromResource(); err != nil { + return err + } } if config.OvnKubeNode.Mode == types.NodeModeDPUHost { @@ -1043,6 +1048,11 @@ func (nc *DefaultNodeNetworkController) Init(ctx context.Context) error { return err } nc.Gateway = gw + } else { + err = nc.initGatewayDPUHostPreStart(nc.nodeAddress, nodeAnnotator) + if err != nil { + return err + } } if err := level.Set(strconv.Itoa(config.Logging.Level)); err != nil { @@ -1081,7 +1091,7 @@ func (nc *DefaultNodeNetworkController) Start(ctx context.Context) error { // Complete gateway initialization if config.OvnKubeNode.Mode == types.NodeModeDPUHost { - err = nc.initGatewayDPUHost(nc.nodeAddress, nodeAnnotator) + err = nc.initGatewayDPUHost() if err != nil { return err } diff --git a/go-controller/pkg/node/egressip/gateway_egressip.go b/go-controller/pkg/node/egressip/gateway_egressip.go index 38bd2b058e..27700e026e 100644 --- a/go-controller/pkg/node/egressip/gateway_egressip.go +++ b/go-controller/pkg/node/egressip/gateway_egressip.go @@ -3,13 +3,9 @@ package egressip import ( "encoding/json" "fmt" - "math" "net" "sync" - "github.com/vishvananda/netlink" - "golang.org/x/sys/unix" - "k8s.io/apimachinery/pkg/util/sets" corev1informers "k8s.io/client-go/informers/core/v1" corev1listers "k8s.io/client-go/listers/core/v1" @@ -23,6 +19,7 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/kube" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/linkmanager" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util/egressip" ) // markIPs contains packet mark and associated EgressIP IP for IPv4 / IPv6. 
Key is packet mark, value egress IP @@ -126,6 +123,7 @@ func (mic *MarkIPsCache) deleteMarkIP(pktMark util.EgressIPMark, ip net.IP) { func (mic *MarkIPsCache) replaceAll(markIPs markIPs) { mic.mu.Lock() mic.markToIPs = markIPs + mic.IPToMark = make(map[string]int, len(markIPs.v4)+len(markIPs.v6)) for mark, ipv4 := range markIPs.v4 { mic.IPToMark[ipv4] = mark } @@ -451,7 +449,7 @@ func (g *BridgeEIPAddrManager) addIPBridge(ip net.IP) error { if err != nil { return fmt.Errorf("failed to get link obj by name %s: %v", g.bridgeName, err) } - return g.addrManager.AddAddress(getEIPBridgeNetlinkAddress(ip, link.Attrs().Index)) + return g.addrManager.AddAddress(*egressip.GetNetlinkAddress(ip, link.Attrs().Index)) } func (g *BridgeEIPAddrManager) deleteIPBridge(ip net.IP) error { @@ -459,7 +457,7 @@ func (g *BridgeEIPAddrManager) deleteIPBridge(ip net.IP) error { if err != nil { return fmt.Errorf("failed to get link obj by name %s: %v", g.bridgeName, err) } - return g.addrManager.DelAddress(getEIPBridgeNetlinkAddress(ip, link.Attrs().Index)) + return g.addrManager.DelAddress(*egressip.GetNetlinkAddress(ip, link.Attrs().Index)) } // getAnnotationIPs retrieves the egress IP annotation from the current node Nodes object. If multiple users, callers must synchronise. @@ -514,29 +512,3 @@ func getIPsStr(ips ...net.IP) []string { } return ipsStr } - -func getEIPBridgeNetlinkAddress(ip net.IP, ifindex int) netlink.Addr { - return netlink.Addr{ - IPNet: &net.IPNet{IP: ip, Mask: util.GetIPFullMask(ip)}, - Flags: getEIPNetlinkAddressFlag(ip), - Scope: int(netlink.SCOPE_UNIVERSE), - ValidLft: getEIPNetlinkAddressValidLft(ip), - LinkIndex: ifindex, - } -} - -func getEIPNetlinkAddressFlag(ip net.IP) int { - // isV6? - if ip.To4() == nil && ip.To16() != nil { - return unix.IFA_F_NODAD - } - return 0 -} - -func getEIPNetlinkAddressValidLft(ip net.IP) int { - // isV6? 
- if ip.To4() == nil && ip.To16() != nil { - return math.MaxUint32 - } - return 0 -} diff --git a/go-controller/pkg/node/egressip/gateway_egressip_test.go b/go-controller/pkg/node/egressip/gateway_egressip_test.go index 07a03a87b6..1fe48a6f5b 100644 --- a/go-controller/pkg/node/egressip/gateway_egressip_test.go +++ b/go-controller/pkg/node/egressip/gateway_egressip_test.go @@ -20,6 +20,7 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/linkmanager" netlink_mocks "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/testing/mocks/github.com/vishvananda/netlink" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util/egressip" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util/mocks" ) @@ -63,7 +64,7 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { nlMock.On("LinkByIndex", bridgeLinkIndex).Return(nlLinkMock, nil) nlMock.On("LinkList").Return([]netlink.Link{nlLinkMock}, nil) nlMock.On("AddrList", nlLinkMock, 0).Return([]netlink.Addr{}, nil) - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) addrMgr, stopFn := initBridgeEIPAddrManager(nodeName, bridgeName, emptyAnnotation) defer stopFn() eip := getEIPAssignedToNode(nodeName, mark, ipV4Addr) @@ -74,11 +75,11 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") gomega.Expect(parseEIPsFromAnnotation(node)).Should(gomega.ConsistOf(ipV4Addr)) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) }) ginkgo.It("doesn't configure or fail when annotation mark isn't found", func() { - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) addrMgr, stopFn := initBridgeEIPAddrManager(nodeName, bridgeName, emptyAnnotation) defer stopFn() eip := getEIPAssignedToNode(nodeName, "", ipV4Addr) @@ -89,11 +90,11 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") gomega.Expect(parseEIPsFromAnnotation(node)).ShouldNot(gomega.ConsistOf(ipV4Addr)) gomega.Expect(nlMock.AssertNotCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) }) ginkgo.It("fails when invalid annotation mark", func() { - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) addrMgr, stopFn := initBridgeEIPAddrManager(nodeName, bridgeName, emptyAnnotation) defer stopFn() eip := getEIPAssignedToNode(nodeName, "not-an-integer", ipV4Addr) @@ -104,7 +105,7 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") 
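The test changes in this file (together with the changes in `gateway_egressip.go` and `pkg/node/controllers/egressip`) replace the removed local helpers `getEIPBridgeNetlinkAddress`, `getEIPNetlinkAddressFlag` and `getEIPNetlinkAddressValidLft` with a single shared `egressip.GetNetlinkAddress` in `pkg/util/egressip`. Its implementation is not part of this diff; judging by the deleted code and the call sites that now use it, it plausibly looks like the sketch below (a reconstruction under those assumptions, not the actual source).

```go
package egressip

import (
	"math"
	"net"

	"github.com/vishvananda/netlink"
	"golang.org/x/sys/unix"
)

// GetNetlinkAddress builds the netlink address used when plumbing an egress
// IP onto a link: a full-mask (/32 or /128) address with universe scope.
// For IPv6 it additionally disables duplicate address detection and pins the
// maximum valid lifetime, matching the removed bridge-specific helper.
func GetNetlinkAddress(ip net.IP, ifindex int) *netlink.Addr {
	isV6 := ip.To4() == nil && ip.To16() != nil
	mask := net.CIDRMask(32, 32)
	flags, validLft := 0, 0
	if isV6 {
		mask = net.CIDRMask(128, 128)
		flags = unix.IFA_F_NODAD
		validLft = math.MaxUint32
	}
	return &netlink.Addr{
		IPNet:     &net.IPNet{IP: ip, Mask: mask},
		Flags:     flags,
		Scope:     int(netlink.SCOPE_UNIVERSE),
		ValidLft:  validLft,
		LinkIndex: ifindex,
	}
}
```

Call sites then pass the returned pointer straight to `AddrAdd`/`AddrDel`, or dereference it where a value is expected, as the hunks in this file show.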
gomega.Expect(parseEIPsFromAnnotation(node)).ShouldNot(gomega.ConsistOf(ipV4Addr)) gomega.Expect(nlMock.AssertNotCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) }) ginkgo.It("configures annotations with existing entries", func() { @@ -113,7 +114,7 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { nlMock.On("LinkByIndex", bridgeLinkIndex).Return(nlLinkMock, nil) nlMock.On("LinkList").Return([]netlink.Link{nlLinkMock}, nil) nlMock.On("AddrList", nlLinkMock, 0).Return([]netlink.Addr{}, nil) - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) addrMgr, stopFn := initBridgeEIPAddrManager(nodeName, bridgeName, generateAnnotFromIPs(ipV4Addr2)) defer stopFn() eip := getEIPAssignedToNode(nodeName, mark, ipV4Addr) @@ -124,7 +125,7 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") gomega.Expect(parseEIPsFromAnnotation(node)).Should(gomega.ConsistOf(ipV4Addr, ipV4Addr2)) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) }) }) @@ -135,7 +136,7 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { nlMock.On("LinkByIndex", bridgeLinkIndex).Return(nlLinkMock, nil) nlMock.On("LinkList").Return([]netlink.Link{nlLinkMock}, nil) nlMock.On("AddrList", nlLinkMock, 0).Return([]netlink.Addr{}, nil) - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) addrMgr, stopFn := initBridgeEIPAddrManager(nodeName, bridgeName, emptyAnnotation) defer stopFn() assignedEIP := getEIPAssignedToNode(nodeName, mark, ipV4Addr) @@ -147,7 +148,7 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") gomega.Expect(parseEIPsFromAnnotation(node)).Should(gomega.ConsistOf(ipV4Addr)) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) }) ginkgo.It("removes EgressIP previously assigned", func() { @@ -156,8 +157,8 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { nlMock.On("LinkByIndex", bridgeLinkIndex).Return(nlLinkMock, nil) nlMock.On("LinkList").Return([]netlink.Link{nlLinkMock}, nil) nlMock.On("AddrList", nlLinkMock, 0).Return([]netlink.Addr{}, nil) - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) - nlMock.On("AddrDel", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrDel", nlLinkMock, 
egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) addrMgr, stopFn := initBridgeEIPAddrManager(nodeName, bridgeName, emptyAnnotation) defer stopFn() assignedEIP := getEIPAssignedToNode(nodeName, mark, ipV4Addr) @@ -172,9 +173,9 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") gomega.Expect(parseEIPsFromAnnotation(node)).ShouldNot(gomega.ConsistOf(ipV4Addr)) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrDel", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) }) ginkgo.It("reconfigures from an old to a new IP", func() { @@ -183,9 +184,9 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { nlMock.On("LinkByIndex", bridgeLinkIndex).Return(nlLinkMock, nil) nlMock.On("LinkList").Return([]netlink.Link{nlLinkMock}, nil) nlMock.On("AddrList", nlLinkMock, 0).Return([]netlink.Addr{}, nil) - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr2), bridgeLinkIndex)).Return(nil) - nlMock.On("AddrDel", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr2), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrDel", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) addrMgr, stopFn := initBridgeEIPAddrManager(nodeName, bridgeName, emptyAnnotation) defer stopFn() unassignedEIP := getEIPNotAssignedToNode(mark, ipV4Addr) @@ -201,11 +202,11 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") gomega.Expect(parseEIPsFromAnnotation(node)).Should(gomega.ConsistOf(ipV4Addr2)) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr2), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr2), bridgeLinkIndex))).Should(gomega.BeTrue()) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrDel", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) }) }) @@ -216,8 +217,8 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { nlMock.On("LinkByIndex", bridgeLinkIndex).Return(nlLinkMock, nil) nlMock.On("LinkList").Return([]netlink.Link{nlLinkMock}, nil) nlMock.On("AddrList", nlLinkMock, 0).Return([]netlink.Addr{}, nil) - 
nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) - nlMock.On("AddrDel", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrDel", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) addrMgr, stopFn := initBridgeEIPAddrManager(nodeName, bridgeName, emptyAnnotation) defer stopFn() eip := getEIPAssignedToNode(nodeName, mark, ipV4Addr) @@ -231,9 +232,9 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") gomega.Expect(parseEIPsFromAnnotation(node)).ShouldNot(gomega.ConsistOf(ipV4Addr)) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrDel", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) }) ginkgo.It("does not update when EIP is deleted that wasn't assigned to the node", func() { @@ -247,7 +248,7 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") gomega.Expect(parseEIPsFromAnnotation(node)).Should(gomega.ConsistOf(ipV4Addr2)) gomega.Expect(nlMock.AssertNotCalled(ginkgo.GinkgoT(), "AddrDel", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) }) }) @@ -258,8 +259,8 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { nlMock.On("LinkByIndex", bridgeLinkIndex).Return(nlLinkMock, nil) nlMock.On("LinkList").Return([]netlink.Link{nlLinkMock}, nil) nlMock.On("AddrList", nlLinkMock, 0).Return([]netlink.Addr{}, nil) - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr2), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr2), bridgeLinkIndex)).Return(nil) addrMgr, stopFn := initBridgeEIPAddrManager(nodeName, bridgeName, emptyAnnotation) defer stopFn() eipAssigned1 := getEIPAssignedToNode(nodeName, mark, ipV4Addr) @@ -271,9 +272,9 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") gomega.Expect(parseEIPsFromAnnotation(node)).Should(gomega.ConsistOf(ipV4Addr, ipV4Addr2)) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - 
getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr2), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr2), bridgeLinkIndex))).Should(gomega.BeTrue()) }) ginkgo.It("delete previous configuration", func() { @@ -282,9 +283,9 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { nlMock.On("LinkByIndex", bridgeLinkIndex).Return(nlLinkMock, nil) nlMock.On("LinkList").Return([]netlink.Link{nlLinkMock}, nil) nlMock.On("AddrList", nlLinkMock, 0).Return([]netlink.Addr{}, nil) - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr2), bridgeLinkIndex)).Return(nil) - nlMock.On("AddrDel", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr3), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr2), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrDel", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr3), bridgeLinkIndex)).Return(nil) addrMgr, stopFn := initBridgeEIPAddrManager(nodeName, bridgeName, generateAnnotFromIPs(ipV4Addr3)) // previously configured IP defer stopFn() eipAssigned1 := getEIPAssignedToNode(nodeName, mark, ipV4Addr) @@ -295,11 +296,11 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") gomega.Expect(parseEIPsFromAnnotation(node)).Should(gomega.ConsistOf(ipV4Addr, ipV4Addr2)) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr2), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr2), bridgeLinkIndex))).Should(gomega.BeTrue()) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrDel", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr3), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr3), bridgeLinkIndex))).Should(gomega.BeTrue()) }) ginkgo.It("no update or failure when mark is not set", func() { @@ -387,11 +388,6 @@ func generateAnnotFromIPs(ips ...string) string { return fmt.Sprintf("[%s]", strings.Join(ipsWithQuotes, ",")) } -func getEIPBridgeNetlinkAddressPtr(ip net.IP, ifindex int) *netlink.Addr { - addr := getEIPBridgeNetlinkAddress(ip, ifindex) - return &addr -} - func parseEIPsFromAnnotation(node *corev1.Node) []string { ips, err := util.ParseNodeBridgeEgressIPsAnnotation(node) if err != nil { diff --git a/go-controller/pkg/node/gateway.go b/go-controller/pkg/node/gateway.go index 97e7baeecb..4a7416c64e 100644 --- a/go-controller/pkg/node/gateway.go +++ b/go-controller/pkg/node/gateway.go @@ -429,15 +429,17 @@ func gatewayInitInternal(nodeName, gwIntf, egressGatewayIntf string, gwNextHops } } - // Set static FDB entry for sharedGW MAC. - // If `GatewayIfaceRep` port is present, use it instead of LOCAL (bridge name). - gwport := gatewayBridge.GetBridgeName() // Default is LOCAL port for the bridge. 
- if repPort := gatewayBridge.GetGatewayIfaceRep(); repPort != "" { // We have an accelerated switchdev device for GW. - gwport = repPort - } + if config.OvnKubeNode.Mode != types.NodeModeDPUHost { + // Set static FDB entry for sharedGW MAC. + // If `GatewayIfaceRep` port is present, use it instead of LOCAL (bridge name). + gwport := gatewayBridge.GetBridgeName() // Default is LOCAL port for the bridge. + if repPort := gatewayBridge.GetGatewayIfaceRep(); repPort != "" { // We have an accelerated switchdev device for GW. + gwport = repPort + } - if err := util.SetStaticFDBEntry(gatewayBridge.GetBridgeName(), gwport, gatewayBridge.GetMAC()); err != nil { - return nil, nil, err + if err := util.SetStaticFDBEntry(gatewayBridge.GetBridgeName(), gwport, gatewayBridge.GetMAC()); err != nil { + return nil, nil, err + } } l3GwConfig := util.L3GatewayConfig{ @@ -466,26 +468,46 @@ func (g *gateway) GetGatewayBridgeIface() string { } func (g *gateway) GetGatewayIface() string { - return g.openflowManager.defaultBridge.GetGatewayIface() + if config.OvnKubeNode.Mode != types.NodeModeDPUHost { + if g.openflowManager == nil { + return "" + } + return g.openflowManager.defaultBridge.GetGatewayIface() + } else { + return config.Gateway.Interface + } } // SetDefaultGatewayBridgeMAC updates the mac address for the OFM used to render flows with func (g *gateway) SetDefaultGatewayBridgeMAC(macAddr net.HardwareAddr) { + if config.OvnKubeNode.Mode == types.NodeModeDPUHost { + return + } g.openflowManager.setDefaultBridgeMAC(macAddr) klog.Infof("Default gateway bridge MAC address updated to %s", macAddr) } func (g *gateway) SetDefaultPodNetworkAdvertised(isPodNetworkAdvertised bool) { + if config.OvnKubeNode.Mode == types.NodeModeDPUHost { + return + } g.openflowManager.defaultBridge.GetNetworkConfig(types.DefaultNetworkName).Advertised.Store(isPodNetworkAdvertised) } func (g *gateway) GetDefaultPodNetworkAdvertised() bool { + if config.OvnKubeNode.Mode == types.NodeModeDPUHost { + return false + } return g.openflowManager.defaultBridge.GetNetworkConfig(types.DefaultNetworkName).Advertised.Load() } // SetDefaultBridgeGARPDropFlows will enable flows to drop GARPs if the openflow // manager has been initialized. func (g *gateway) SetDefaultBridgeGARPDropFlows(isDropped bool) { + if config.OvnKubeNode.Mode == types.NodeModeDPUHost { + return + } + if g.openflowManager == nil { return } @@ -495,14 +517,22 @@ func (g *gateway) SetDefaultBridgeGARPDropFlows(isDropped bool) { // Reconcile handles triggering updates to different components of a gateway, like OFM, Services func (g *gateway) Reconcile() error { klog.Info("Reconciling gateway with updates") - if err := g.openflowManager.updateBridgeFlowCache(g.nodeIPManager.ListAddresses()); err != nil { - return err + if config.OvnKubeNode.Mode != types.NodeModeDPUHost { + if g.openflowManager != nil { + if err := g.openflowManager.updateBridgeFlowCache(g.nodeIPManager.ListAddresses()); err != nil { + return err + } + // let's sync these flows immediately + g.openflowManager.requestFlowSync() + } } - // let's sync these flows immediately - g.openflowManager.requestFlowSync() - err := g.updateSNATRules() - if err != nil { - return err + // TBD updateSNATRules() gets node host-cidr by accessing gateway.nodeIPManager, which does not + // exist in dpu-host mode. 
+ if config.OvnKubeNode.Mode == types.NodeModeFull { + err := g.updateSNATRules() + if err != nil { + return err + } } // Services create OpenFlow flows as well, need to update them all if g.servicesRetryFramework != nil { diff --git a/go-controller/pkg/node/gateway_init.go b/go-controller/pkg/node/gateway_init.go index b4d11d69cf..659933d419 100644 --- a/go-controller/pkg/node/gateway_init.go +++ b/go-controller/pkg/node/gateway_init.go @@ -384,13 +384,9 @@ func interfaceForEXGW(intfName string) string { return intfName } -func (nc *DefaultNodeNetworkController) initGatewayDPUHost(kubeNodeIP net.IP, nodeAnnotator kube.Annotator) error { - // A DPU host gateway is complementary to the shared gateway running - // on the DPU embedded CPU. it performs some initializations and - // watch on services for iptable rule updates and run a loadBalancerHealth checker - // Note: all K8s Node related annotations are handled from DPU. - klog.Info("Initializing Shared Gateway Functionality on DPU host") - var err error +// TODO(adrianc): revisit if support for nodeIPManager is needed. +func (nc *DefaultNodeNetworkController) initGatewayDPUHostPreStart(kubeNodeIP net.IP, nodeAnnotator kube.Annotator) error { + klog.Info("Initializing Shared Gateway Functionality for Gateway PreStart on DPU host") // Find the network interface that has the Kubernetes node IP assigned to it // This interface will be used for DPU host gateway operations @@ -470,18 +466,43 @@ func (nc *DefaultNodeNetworkController) initGatewayDPUHost(kubeNodeIP net.IP, no return err } - gw := &gateway{ + if err = addHostMACBindings(kubeIntf); err != nil { + return fmt.Errorf("failed to add MAC bindings for service routing: %w", err) + } + + gatewayNextHops, _, err := getGatewayNextHops() + if err != nil { + return err + } + + nc.Gateway = &gateway{ initFunc: func() error { return nil }, readyFunc: func() (bool, error) { return true, nil }, watchFactory: nc.watchFactory.(*factory.WatchFactory), + nextHops: gatewayNextHops, } + return nil +} - // TODO(adrianc): revisit if support for nodeIPManager is needed. +func (nc *DefaultNodeNetworkController) initGatewayDPUHost() error { + // A DPU host gateway is complementary to the shared gateway running + // on the DPU embedded CPU. it performs some initializations and + // watch on services for iptable rule updates and run a loadBalancerHealth checker + // Note: all K8s Node related annotations are handled from DPU. + klog.Info("Initializing Shared Gateway Functionality for Gateway Start on DPU host") + var err error + // TODO(adrianc): revisit if support for nodeIPManager is needed. 
+ gw := nc.Gateway.(*gateway) if config.Gateway.NodeportEnable { if err := initSharedGatewayIPTables(); err != nil { return err } + if util.IsNetworkSegmentationSupportEnabled() { + if err := configureUDNServicesNFTables(); err != nil { + return fmt.Errorf("unable to configure UDN nftables: %w", err) + } + } gw.nodePortWatcherIptables = newNodePortWatcherIptables(nc.networkManager) gw.loadBalancerHealthChecker = newLoadBalancerHealthChecker(nc.name, nc.watchFactory) portClaimWatcher, err := newPortClaimWatcher(nc.recorder) @@ -491,10 +512,6 @@ func (nc *DefaultNodeNetworkController) initGatewayDPUHost(kubeNodeIP net.IP, no gw.portClaimWatcher = portClaimWatcher } - if err := addHostMACBindings(kubeIntf); err != nil { - return fmt.Errorf("failed to add MAC bindings for service routing") - } - err = gw.Init(nc.stopChan, nc.wg) nc.Gateway = gw return err diff --git a/go-controller/pkg/node/gateway_init_linux_test.go b/go-controller/pkg/node/gateway_init_linux_test.go index 7e1f330937..b46657c5c2 100644 --- a/go-controller/pkg/node/gateway_init_linux_test.go +++ b/go-controller/pkg/node/gateway_init_linux_test.go @@ -925,7 +925,9 @@ func shareGatewayInterfaceDPUHostTest(app *cli.App, testNS ns.NetNS, uplinkName, nodeAnnotator := kube.NewNodeAnnotator(k, existingNode.Name) - err := nc.initGatewayDPUHost(net.ParseIP(hostIP), nodeAnnotator) + err := nc.initGatewayDPUHostPreStart(net.ParseIP(hostIP), nodeAnnotator) + Expect(err).NotTo(HaveOccurred()) + err = nc.initGatewayDPUHost() Expect(err).NotTo(HaveOccurred()) link, err := netlink.LinkByName(uplinkName) diff --git a/go-controller/pkg/node/gateway_shared_intf.go b/go-controller/pkg/node/gateway_shared_intf.go index 5d1d94cba6..773671b0ff 100644 --- a/go-controller/pkg/node/gateway_shared_intf.go +++ b/go-controller/pkg/node/gateway_shared_intf.go @@ -405,6 +405,11 @@ func (npw *nodePortWatcher) updateServiceFlowCache(service *corev1.Service, netI defaultNetConfig := npw.ofm.defaultBridge.GetActiveNetworkBridgeConfigCopy(types.DefaultNetworkName) var flows []string clusterIPs := util.GetClusterIPs(service) + outputActions := fmt.Sprintf("output:%s", defaultNetConfig.OfPortPatch) + if config.Gateway.VLANID != 0 { + outputActions = fmt.Sprintf("mod_vlan_vid:%d,%s", config.Gateway.VLANID, outputActions) + } + for _, clusterIP := range clusterIPs { ipPrefix := protoPrefixV4 if utilnet.IsIPv6String(clusterIP) { @@ -415,9 +420,9 @@ func (npw *nodePortWatcher) updateServiceFlowCache(service *corev1.Service, netI // ip,nw_dst=10.96.0.1 actions=mod_dl_dst:02:42:ac:12:00:03,output:"patch-breth0_ov" // This flow is used for UDNs and advertised UDNs to be able to reach kapi and dns services alone on default network flows = append(flows, fmt.Sprintf("cookie=%s, priority=300, table=2, %s, %s_dst=%s, "+ - "actions=set_field:%s->eth_dst,output:%s", + "actions=set_field:%s->eth_dst,%s", nodetypes.DefaultOpenFlowCookie, ipPrefix, ipPrefix, clusterIP, - npw.ofm.getDefaultBridgeMAC().String(), defaultNetConfig.OfPortPatch)) + npw.ofm.getDefaultBridgeMAC().String(), outputActions)) if util.IsRouteAdvertisementsEnabled() { // if the network is advertised, then for the reply from kapi and dns services to go back @@ -1584,8 +1589,7 @@ func newNodePortWatcher( // on the OVS bridge in the host. These flows act only on the packets coming in from outside // of the node. 
If someone on the node is trying to access the NodePort service, those packets // will not be processed by the OpenFlow flows, so we need to add iptable rules that DNATs the - // NodePortIP:NodePort to ClusterServiceIP:Port. We don't need to do this while - // running on DPU or on DPU-Host. + // NodePortIP:NodePort to ClusterServiceIP:Port. We don't need to do this on DPU. if config.OvnKubeNode.Mode == types.NodeModeFull { if config.Gateway.Mode == config.GatewayModeLocal { if err := initLocalGatewayIPTables(); err != nil { diff --git a/go-controller/pkg/node/gateway_udn.go b/go-controller/pkg/node/gateway_udn.go index 9d3d0a75ac..a284dca0cd 100644 --- a/go-controller/pkg/node/gateway_udn.go +++ b/go-controller/pkg/node/gateway_udn.go @@ -125,13 +125,17 @@ func NewUserDefinedNetworkGateway(netInfo util.NetInfo, node *corev1.Node, nodeL return nil, fmt.Errorf("unable to dereference default node network controller gateway object") } - if gw.openflowManager == nil { + if config.OvnKubeNode.Mode != types.NodeModeDPUHost && gw.openflowManager == nil { return nil, fmt.Errorf("openflow manager has not been provided for network: %s", netInfo.GetNetworkName()) } - intfName := gw.openflowManager.defaultBridge.GetGatewayIface() + + intfName := gw.GetGatewayIface() + if intfName == "" { + return nil, fmt.Errorf("failed to get gateway interface for network: %s", netInfo.GetNetworkName()) + } link, err := util.GetNetLinkOps().LinkByName(intfName) if err != nil { - return nil, fmt.Errorf("unable to get link for %s, error: %v", intfName, err) + return nil, fmt.Errorf("unable to get link for gateway interface %s, error: %v", intfName, err) } return &UserDefinedNetworkGateway{ @@ -204,7 +208,7 @@ func (udng *UserDefinedNetworkGateway) addMarkChain() error { // AddNetwork will be responsible to create all plumbings // required by this UDN on the gateway side func (udng *UserDefinedNetworkGateway) AddNetwork() error { - if udng.openflowManager == nil { + if config.OvnKubeNode.Mode != types.NodeModeDPUHost && udng.openflowManager == nil { return fmt.Errorf("openflow manager has not been provided for network: %s", udng.NetInfo.GetNetworkName()) } // port is created first and its MAC address configured. 
The IP(s) on that link are added after enslaving to a VRF device (addUDNManagementPortIPs) @@ -214,81 +218,94 @@ func (udng *UserDefinedNetworkGateway) AddNetwork() error { if err != nil { return fmt.Errorf("could not create management port netdevice for network %s: %w", udng.GetNetworkName(), err) } - vrfDeviceName := util.GetNetworkVRFName(udng.NetInfo) - routes, err := udng.computeRoutesForUDN(mplink) - if err != nil { - return fmt.Errorf("failed to compute routes for network %s, err: %v", udng.GetNetworkName(), err) - } - if err = udng.vrfManager.AddVRF(vrfDeviceName, mplink.Attrs().Name, uint32(udng.vrfTableId), nil); err != nil { - return fmt.Errorf("could not add VRF %d for network %s, err: %v", udng.vrfTableId, udng.GetNetworkName(), err) - } - if err = udng.addUDNManagementPortIPs(mplink); err != nil { - return fmt.Errorf("unable to add management port IP(s) for link %s, for network %s: %w", mplink.Attrs().Name, udng.GetNetworkName(), err) - } - if err = udng.vrfManager.AddVRFRoutes(vrfDeviceName, routes); err != nil { - return fmt.Errorf("could not add VRF %s routes for network %s, err: %v", vrfDeviceName, udng.GetNetworkName(), err) + + if config.OvnKubeNode.Mode != types.NodeModeDPU { + vrfDeviceName := util.GetNetworkVRFName(udng.NetInfo) + routes, err := udng.computeRoutesForUDN(mplink) + if err != nil { + return fmt.Errorf("failed to compute routes for network %s, err: %v", udng.GetNetworkName(), err) + } + if err = udng.vrfManager.AddVRF(vrfDeviceName, mplink.Attrs().Name, uint32(udng.vrfTableId), nil); err != nil { + return fmt.Errorf("could not add VRF %d for network %s, err: %v", udng.vrfTableId, udng.GetNetworkName(), err) + } + if err = udng.addUDNManagementPortIPs(mplink); err != nil { + return fmt.Errorf("unable to add management port IP(s) for link %s, for network %s: %w", mplink.Attrs().Name, udng.GetNetworkName(), err) + } + if err = udng.vrfManager.AddVRFRoutes(vrfDeviceName, routes); err != nil { + return fmt.Errorf("could not add VRF %s routes for network %s, err: %v", vrfDeviceName, udng.GetNetworkName(), err) + } } udng.updateAdvertisementStatus() - // create the iprules for this network - if err = udng.updateUDNVRFIPRules(); err != nil { - return fmt.Errorf("failed to update IP rules for network %s: %w", udng.GetNetworkName(), err) - } - - if err = udng.updateAdvertisedUDNIsolationRules(); err != nil { - return fmt.Errorf("failed to update isolation rules for network %s: %w", udng.GetNetworkName(), err) - } + if config.OvnKubeNode.Mode != types.NodeModeDPU { + // create the iprules for this network + if err = udng.updateUDNVRFIPRules(); err != nil { + return fmt.Errorf("failed to update IP rules for network %s: %w", udng.GetNetworkName(), err) + } - if err := udng.updateUDNVRFIPRoute(); err != nil { - return fmt.Errorf("failed to update ip routes for network %s: %w", udng.GetNetworkName(), err) - } + if err = udng.updateAdvertisedUDNIsolationRules(); err != nil { + return fmt.Errorf("failed to update isolation rules for network %s: %w", udng.GetNetworkName(), err) + } - // add loose mode for rp filter on management port - mgmtPortName := util.GetNetworkScopedK8sMgmtHostIntfName(uint(udng.GetNetworkID())) - if err := addRPFilterLooseModeForManagementPort(mgmtPortName); err != nil { - return fmt.Errorf("could not set loose mode for reverse path filtering on management port %s: %v", mgmtPortName, err) - } + if err := udng.updateUDNVRFIPRoute(); err != nil { + return fmt.Errorf("failed to update ip routes for network %s: %w", udng.GetNetworkName(), err) + } - 
nodeSubnets, err := udng.getLocalSubnets() - var mgmtIPs []*net.IPNet - for _, subnet := range nodeSubnets { - mgmtIPs = append(mgmtIPs, udng.GetNodeManagementIP(subnet)) - } - if err != nil { - return fmt.Errorf("failed to get node subnets for network %s: %w", udng.GetNetworkName(), err) - } - if err = udng.openflowManager.addNetwork(udng.NetInfo, nodeSubnets, mgmtIPs, udng.masqCTMark, udng.pktMark, udng.v6MasqIPs, udng.v4MasqIPs); err != nil { - return fmt.Errorf("could not add network %s: %v", udng.GetNetworkName(), err) + // add loose mode for rp filter on management port + mgmtPortName := util.GetNetworkScopedK8sMgmtHostIntfName(uint(udng.GetNetworkID())) + if err := addRPFilterLooseModeForManagementPort(mgmtPortName); err != nil { + return fmt.Errorf("could not set loose mode for reverse path filtering on management port %s: %v", mgmtPortName, err) + } } - waiter := newStartupWaiterWithTimeout(waitForPatchPortTimeout) - readyFunc := func() (bool, error) { - if err := udng.openflowManager.defaultBridge.SetNetworkOfPatchPort(udng.GetNetworkName()); err != nil { - klog.V(3).Infof("Failed to set network %s's openflow ports for default bridge; error: %v", udng.GetNetworkName(), err) - return false, nil + if config.OvnKubeNode.Mode != types.NodeModeDPUHost { + nodeSubnets, err := udng.getLocalSubnets() + if err != nil { + return fmt.Errorf("failed to get node subnets for network %s: %w", udng.GetNetworkName(), err) } - if udng.openflowManager.externalGatewayBridge != nil { - if err := udng.openflowManager.externalGatewayBridge.SetNetworkOfPatchPort(udng.GetNetworkName()); err != nil { - klog.V(3).Infof("Failed to set network %s's openflow ports for secondary bridge; error: %v", udng.GetNetworkName(), err) + var mgmtIPs []*net.IPNet + for _, subnet := range nodeSubnets { + mgmtIPs = append(mgmtIPs, udng.GetNodeManagementIP(subnet)) + } + if err = udng.openflowManager.addNetwork(udng.NetInfo, nodeSubnets, mgmtIPs, udng.masqCTMark, udng.pktMark, udng.v6MasqIPs, udng.v4MasqIPs); err != nil { + return fmt.Errorf("could not add network %s: %v", udng.GetNetworkName(), err) + } + + waiter := newStartupWaiterWithTimeout(waitForPatchPortTimeout) + readyFunc := func() (bool, error) { + if err := udng.openflowManager.defaultBridge.SetNetworkOfPatchPort(udng.GetNetworkName()); err != nil { + klog.V(3).Infof("Failed to set network %s's openflow ports for default bridge; error: %v", udng.GetNetworkName(), err) return false, nil } + if udng.openflowManager.externalGatewayBridge != nil { + if err := udng.openflowManager.externalGatewayBridge.SetNetworkOfPatchPort(udng.GetNetworkName()); err != nil { + klog.V(3).Infof("Failed to set network %s's openflow ports for secondary bridge; error: %v", udng.GetNetworkName(), err) + return false, nil + } + } + return true, nil } - return true, nil - } - postFunc := func() error { + postFunc := func() error { + if err := udng.gateway.Reconcile(); err != nil { + return fmt.Errorf("failed to reconcile flows on bridge for network %s; error: %v", udng.GetNetworkName(), err) + } + return nil + } + waiter.AddWait(readyFunc, postFunc) + if err := waiter.Wait(); err != nil { + return err + } + } else { if err := udng.gateway.Reconcile(); err != nil { return fmt.Errorf("failed to reconcile flows on bridge for network %s; error: %v", udng.GetNetworkName(), err) } - return nil - } - waiter.AddWait(readyFunc, postFunc) - if err := waiter.Wait(); err != nil { - return err } - if err := udng.addMarkChain(); err != nil { - return fmt.Errorf("failed to add the service masquerade 
chain: %w", err) + if config.OvnKubeNode.Mode != types.NodeModeDPU { + if err := udng.addMarkChain(); err != nil { + return fmt.Errorf("failed to add the service masquerade chain: %w", err) + } } // run gateway reconciliation loop on network configuration changes @@ -305,33 +322,41 @@ func (udng *UserDefinedNetworkGateway) GetNetworkRuleMetadata() string { // the gateway side. It's considered invalid to call this instance after // DelNetwork has returned succesfully. func (udng *UserDefinedNetworkGateway) DelNetwork() error { - vrfDeviceName := util.GetNetworkVRFName(udng.NetInfo) - // delete the iprules for this network - if err := udng.ruleManager.DeleteWithMetadata(udng.GetNetworkRuleMetadata()); err != nil { - return fmt.Errorf("unable to delete iprules for network %s, err: %v", udng.GetNetworkName(), err) + if config.OvnKubeNode.Mode != types.NodeModeDPU { + vrfDeviceName := util.GetNetworkVRFName(udng.NetInfo) + // delete the iprules for this network + if err := udng.ruleManager.DeleteWithMetadata(udng.GetNetworkRuleMetadata()); err != nil { + return fmt.Errorf("unable to delete iprules for network %s, err: %v", udng.GetNetworkName(), err) + } + // delete the VRF device for this network + if err := udng.vrfManager.DeleteVRF(vrfDeviceName); err != nil { + return err + } } - // delete the VRF device for this network - if err := udng.vrfManager.DeleteVRF(vrfDeviceName); err != nil { - return err + if config.OvnKubeNode.Mode != types.NodeModeDPUHost { + // delete the openflows for this network + if udng.openflowManager != nil { + udng.openflowManager.delNetwork(udng.NetInfo) + } } - // delete the openflows for this network - if udng.openflowManager != nil { - udng.openflowManager.delNetwork(udng.NetInfo) + if udng.openflowManager != nil || config.OvnKubeNode.Mode == types.NodeModeDPUHost { if err := udng.gateway.Reconcile(); err != nil { return fmt.Errorf("failed to reconcile default gateway for network %s, err: %v", udng.GetNetworkName(), err) } } - err := udng.deleteAdvertisedUDNIsolationRules() - if err != nil { - return fmt.Errorf("failed to remove advertised UDN isolation rules for network %s: %w", udng.GetNetworkName(), err) - } + if config.OvnKubeNode.Mode != types.NodeModeDPU { + err := udng.deleteAdvertisedUDNIsolationRules() + if err != nil { + return fmt.Errorf("failed to remove advertised UDN isolation rules for network %s: %w", udng.GetNetworkName(), err) + } - if err := udng.delMarkChain(); err != nil { - return err + if err := udng.delMarkChain(); err != nil { + return err + } } // delete the management port interface for this network - err = udng.deleteUDNManagementPort() + err := udng.deleteUDNManagementPort() if err != nil { return err } @@ -795,40 +820,50 @@ func (udng *UserDefinedNetworkGateway) Reconcile() { func (udng *UserDefinedNetworkGateway) doReconcile() error { klog.Infof("Reconciling gateway with updates for UDN %s", udng.GetNetworkName()) - // shouldn't happen - if udng.openflowManager == nil || udng.openflowManager.defaultBridge == nil { - return fmt.Errorf("openflow manager with default bridge configuration has not been provided for network %s", udng.GetNetworkName()) + if config.OvnKubeNode.Mode != types.NodeModeDPUHost { + // shouldn't happen + if udng.openflowManager == nil || udng.openflowManager.defaultBridge == nil { + return fmt.Errorf("openflow manager with default bridge configuration has not been provided for network %s", udng.GetNetworkName()) + } } udng.updateAdvertisementStatus() - // update bridge configuration - netConfig := 
udng.openflowManager.defaultBridge.GetNetworkConfig(udng.GetNetworkName()) - if netConfig == nil { - return fmt.Errorf("missing bridge configuration for network %s", udng.GetNetworkName()) + if config.OvnKubeNode.Mode != types.NodeModeDPUHost { + // update bridge configuration + netConfig := udng.openflowManager.defaultBridge.GetNetworkConfig(udng.GetNetworkName()) + if netConfig == nil { + return fmt.Errorf("missing bridge configuration for network %s", udng.GetNetworkName()) + } + netConfig.Advertised.Store(udng.isNetworkAdvertised) } - netConfig.Advertised.Store(udng.isNetworkAdvertised) - if err := udng.updateUDNVRFIPRules(); err != nil { - return fmt.Errorf("error while updating ip rule for UDN %s: %s", udng.GetNetworkName(), err) - } + if config.OvnKubeNode.Mode != types.NodeModeDPU { + if err := udng.updateUDNVRFIPRules(); err != nil { + return fmt.Errorf("error while updating ip rule for UDN %s: %s", udng.GetNetworkName(), err) + } - if err := udng.updateUDNVRFIPRoute(); err != nil { - return fmt.Errorf("error while updating ip route for UDN %s: %s", udng.GetNetworkName(), err) + if err := udng.updateUDNVRFIPRoute(); err != nil { + return fmt.Errorf("error while updating ip route for UDN %s: %s", udng.GetNetworkName(), err) + } } - // add below OpenFlows based on the gateway mode and whether the network is advertised or not: - // table=1, n_packets=0, n_bytes=0, priority=16,ip,nw_dst=128.192.0.2 actions=LOCAL (Both gateway modes) - // table=1, n_packets=0, n_bytes=0, priority=15,ip,nw_dst=128.192.0.0/14 actions=output:3 (shared gateway mode) - // necessary service isolation flows based on whether network is advertised or not - if err := udng.openflowManager.updateBridgeFlowCache(udng.nodeIPManager.ListAddresses()); err != nil { - return fmt.Errorf("error while updating logical flow for UDN %s: %s", udng.GetNetworkName(), err) + if config.OvnKubeNode.Mode != types.NodeModeDPUHost { + // add below OpenFlows based on the gateway mode and whether the network is advertised or not: + // table=1, n_packets=0, n_bytes=0, priority=16,ip,nw_dst=128.192.0.2 actions=LOCAL (Both gateway modes) + // table=1, n_packets=0, n_bytes=0, priority=15,ip,nw_dst=128.192.0.0/14 actions=output:3 (shared gateway mode) + // necessary service isolation flows based on whether network is advertised or not + if err := udng.openflowManager.updateBridgeFlowCache(udng.nodeIPManager.ListAddresses()); err != nil { + return fmt.Errorf("error while updating logical flow for UDN %s: %s", udng.GetNetworkName(), err) + } + // let's sync these flows immediately + udng.openflowManager.requestFlowSync() } - // let's sync these flows immediately - udng.openflowManager.requestFlowSync() - if err := udng.updateAdvertisedUDNIsolationRules(); err != nil { - return fmt.Errorf("error while updating advertised UDN isolation rules for network %s: %w", udng.GetNetworkName(), err) + if config.OvnKubeNode.Mode != types.NodeModeDPU { + if err := udng.updateAdvertisedUDNIsolationRules(); err != nil { + return fmt.Errorf("error while updating advertised UDN isolation rules for network %s: %w", udng.GetNetworkName(), err) + } } return nil } diff --git a/go-controller/pkg/ovn/base_network_controller_pods.go b/go-controller/pkg/ovn/base_network_controller_pods.go index e3abb1550c..91c092c208 100644 --- a/go-controller/pkg/ovn/base_network_controller_pods.go +++ b/go-controller/pkg/ovn/base_network_controller_pods.go @@ -1032,22 +1032,17 @@ func (bnc *BaseNetworkController) allocatesPodAnnotation() bool { func (bnc *BaseNetworkController) 
shouldReleaseDeletedPod(pod *corev1.Pod, switchName, nad string, podIfAddrs []*net.IPNet) (bool, error) { var err error - var isMigratedSourcePodStale bool - if !bnc.IsUserDefinedNetwork() { - isMigratedSourcePodStale, err = kubevirt.IsMigratedSourcePodStale(bnc.watchFactory, pod) + if !bnc.IsUserDefinedNetwork() && kubevirt.IsPodLiveMigratable(pod) { + allVMPodsAreCompleted, err := kubevirt.AllVMPodsAreCompleted(bnc.watchFactory, pod) if err != nil { return false, err } - } - - // Removing the the kubevirt stale pods should not de allocate the IPs - // to ensure that new pods do not take them - if isMigratedSourcePodStale { - return false, nil - } - if !util.PodCompleted(pod) { - return true, nil + // Removing the kubevirt stale pods should not deallocate the IPs + // to ensure that new pods do not take them + if !allVMPodsAreCompleted { + return false, nil + } } if bnc.wasPodReleasedBeforeStartup(string(pod.UID), nad) { @@ -1059,16 +1054,21 @@ func (bnc *BaseNetworkController) shouldReleaseDeletedPod(pod *corev1.Pod, switc return false, nil } - shouldReleasePodIPs := func() (bool, error) { - // If this pod applies to live migration it could have migrated so get the - // correct node name corresponding with the subnet. If the subnet is not - // tracked within the zone, nodeName will be empty which will force - // canReleasePodIPs to lookup all nodes. - nodeName := pod.Spec.NodeName - if !bnc.IsUserDefinedNetwork() && kubevirt.IsPodLiveMigratable(pod) { - nodeName, _ = bnc.lsManager.GetSubnetName(podIfAddrs) - } + // If this pod applies to live migration, it could have migrated within the + // zone, so get the correct node name corresponding to the subnet. If the + // subnet is not tracked within the zone, nodeName will be empty which + // will force a lookup for all nodes.
+ zoneOwnsSubnet := true + nodeName := pod.Spec.NodeName + if !bnc.IsUserDefinedNetwork() && kubevirt.IsPodLiveMigratable(pod) { + switchName, zoneOwnsSubnet = bnc.lsManager.GetSubnetName(podIfAddrs) + } + if !zoneOwnsSubnet { + // force shouldReleasePodIPs to search all nodes + nodeName = "" + } + shouldReleasePodIPs := func() (bool, error) { shouldRelease, err := bnc.canReleasePodIPs(podIfAddrs, nodeName) if err != nil { return false, err @@ -1082,7 +1082,7 @@ func (bnc *BaseNetworkController) shouldReleaseDeletedPod(pod *corev1.Pod, switc // if other pods are using the same IPs just in case we are processing // events in different order than cluster manager did (best effort, there // can still be issues with this) - if !bnc.allocatesPodAnnotation() { + if !bnc.allocatesPodAnnotation() || !zoneOwnsSubnet { shouldRelease, err = shouldReleasePodIPs() } else { shouldRelease, err = bnc.lsManager.ConditionalIPRelease(switchName, podIfAddrs, shouldReleasePodIPs) diff --git a/go-controller/pkg/ovn/kubevirt_test.go b/go-controller/pkg/ovn/kubevirt_test.go index 061b9fc5f6..c311ad657b 100644 --- a/go-controller/pkg/ovn/kubevirt_test.go +++ b/go-controller/pkg/ovn/kubevirt_test.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "net" + "slices" "strings" "time" @@ -20,6 +21,7 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/kubevirt" libovsdbops "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdb/ops" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/testing" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/testing/libovsdb" ovntypes "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" @@ -41,6 +43,8 @@ var _ = Describe("OVN Kubevirt Operations", func() { dnsServiceIPv6 = "fd7b:6b4d:7b25:d22f::3" clusterCIDRIPv4 = "10.128.0.0/16" clusterCIDRIPv6 = "fe00::/64" + subnetSuffixIPv4 = "/24" + subnetSuffixIPv6 = "/64" ) type testDHCPOptions struct { cidr string @@ -104,6 +108,8 @@ var _ = Describe("OVN Kubevirt Operations", func() { subnetIPv6 string transitSwitchPortIPv4 string transitSwitchPortIPv6 string + addressIPv4 string + addressIPv6 string } type testVM struct { @@ -124,6 +130,8 @@ var _ = Describe("OVN Kubevirt Operations", func() { lrpNetworkIPv6: "fd98::4/64", transitSwitchPortIPv4: "100.65.0.4/24", transitSwitchPortIPv6: "fd13::4/64", + addressIPv4: "10.89.0.1/24", + addressIPv6: "fc00:f853:ccd:e793::1/64", }, node2: { nodeID: "5", @@ -133,6 +141,8 @@ var _ = Describe("OVN Kubevirt Operations", func() { lrpNetworkIPv6: "fd98::5/64", transitSwitchPortIPv4: "100.65.0.5/24", transitSwitchPortIPv6: "fd13::5/64", + addressIPv4: "10.89.0.2/24", + addressIPv6: "fc00:f853:ccd:e793::2/64", }, node3: { nodeID: "6", @@ -142,6 +152,8 @@ var _ = Describe("OVN Kubevirt Operations", func() { lrpNetworkIPv6: "fd98::6/64", transitSwitchPortIPv4: "100.65.0.6/24", transitSwitchPortIPv6: "fd13::6/64", + addressIPv4: "10.89.0.3/24", + addressIPv6: "fc00:f853:ccd:e793::3/64", }, } vmByName = map[string]testVM{ @@ -245,6 +257,27 @@ var _ = Describe("OVN Kubevirt Operations", func() { return append(previousData, data...) 
} + filterOutStaleVirtLauncherExpectedTestData = func(namespace, name string, previousData []libovsdb.TestData) []libovsdb.TestData { + var data []libovsdb.TestData + lspUUID := util.GetLogicalPortName(namespace, name) + "-UUID" + for _, d := range previousData { + switch model := d.(type) { + case *nbdb.LogicalSwitch: + lsp := *model + lsp.Ports = slices.Clone(lsp.Ports) + lsp.Ports = slices.DeleteFunc(lsp.Ports, func(port string) bool { return port == lspUUID }) + d = &lsp + case *nbdb.LogicalSwitchPort: + if model.UUID == lspUUID { + continue + } + } + data = append(data, d) + + } + return data + } + newPodFromTestVirtLauncherPod = func(t testVirtLauncherPod) *corev1.Pod { if t.podName == "" { return nil @@ -342,8 +375,35 @@ var _ = Describe("OVN Kubevirt Operations", func() { ExternalIDs: ids, } } + composeNats = func(pod testVirtLauncherPod) ([]string, []*nbdb.NAT) { + var ids []string + var nats []*nbdb.NAT + if config.IPv4Mode { + id := pod.podName + "-IPv4-NAD-UUID" + nats = append(nats, &nbdb.NAT{ + UUID: id, + LogicalIP: pod.addressIPv4, + ExternalIP: testing.MustParseIPNet(nodeByName[pod.nodeName].addressIPv4).IP.String(), + Options: map[string]string{"stateless": "false"}, + Type: "snat", + }) + ids = append(ids, id) + } + if config.IPv6Mode { + id := pod.podName + "-IPv6-NAD-UUID" + nats = append(nats, &nbdb.NAT{ + UUID: id, + LogicalIP: pod.addressIPv6, + ExternalIP: testing.MustParseIPNet(nodeByName[pod.nodeName].addressIPv6).IP.String(), + Options: map[string]string{"stateless": "false"}, + Type: "snat", + }) + ids = append(ids, id) + } + return ids, nats + } - expectedNBDBAfterCleanup = func(expectedStaticRoutes []*nbdb.LogicalRouterStaticRoute) []libovsdb.TestData { + expectedNBDBAfterCleanup = func(expectedStaticRoutes []*nbdb.LogicalRouterStaticRoute, expectedNATs map[string][]*nbdb.NAT) []libovsdb.TestData { data := []libovsdb.TestData{} expectedPoliciesAfterCleanup := []string{} expectedStaticRoutesAfterCleanup := []string{} @@ -366,6 +426,11 @@ var _ = Describe("OVN Kubevirt Operations", func() { continue } else if lr, ok := nbData.(*nbdb.LogicalRouter); ok && lr.Name == ovntypes.OVNClusterRouter { expectedOvnClusterRouterAfterCleanup = lr + } else if lr, ok := nbData.(*nbdb.LogicalRouter); ok && expectedNATs[lr.Name] != nil { + for _, nat := range expectedNATs[lr.Name] { + lr.Nat = append(lr.Nat, nat.UUID) + data = append(data, nat) + } } data = append(data, nbData) } @@ -487,6 +552,7 @@ var _ = Describe("OVN Kubevirt Operations", func() { // To skip port group not found error config.EnableMulticast = false + config.Gateway.DisableSNATMultipleGWs = true fakeOvn = NewFakeOVN(true) }) @@ -671,6 +737,8 @@ var _ = Describe("OVN Kubevirt Operations", func() { Annotations: map[string]string{ "k8s.ovn.org/node-transit-switch-port-ifaddr": fmt.Sprintf(`{"ipv4": %q, "ipv6": %q}`, nodeByName[node1].transitSwitchPortIPv4, nodeByName[node1].transitSwitchPortIPv6), "k8s.ovn.org/node-subnets": fmt.Sprintf(`{"default":[%q,%q]}`, nodeByName[node1].subnetIPv4, nodeByName[node1].subnetIPv6), + "k8s.ovn.org/l3-gateway-config": fmt.Sprintf(`{"default": {"mode": "local", "mac-address":"7e:57:f8:f0:3c:51", "ip-addresses":[%q, %q]}}`, nodeByName[node1].addressIPv4, nodeByName[node1].addressIPv6), + "k8s.ovn.org/node-chassis-id": "1", util.OvnNodeID: nodeByName[node1].nodeID, }, }, @@ -681,6 +749,8 @@ var _ = Describe("OVN Kubevirt Operations", func() { Annotations: map[string]string{ "k8s.ovn.org/node-transit-switch-port-ifaddr": fmt.Sprintf(`{"ipv4": %q, "ipv6": %q}`, 
nodeByName[node2].transitSwitchPortIPv4, nodeByName[node2].transitSwitchPortIPv6), "k8s.ovn.org/node-subnets": fmt.Sprintf(`{"default":[%q,%q]}`, nodeByName[node2].subnetIPv4, nodeByName[node2].subnetIPv6), + "k8s.ovn.org/l3-gateway-config": fmt.Sprintf(`{"default": {"mode": "local", "mac-address":"7e:57:f8:f0:3c:52", "ip-addresses":[%q, %q]}}`, nodeByName[node2].addressIPv4, nodeByName[node2].addressIPv6), + "k8s.ovn.org/node-chassis-id": "2", util.OvnNodeID: nodeByName[node2].nodeID, }, }, @@ -691,6 +761,8 @@ var _ = Describe("OVN Kubevirt Operations", func() { Annotations: map[string]string{ "k8s.ovn.org/node-transit-switch-port-ifaddr": fmt.Sprintf(`{"ipv4": %q, "ipv6": %q}`, nodeByName[node3].transitSwitchPortIPv4, nodeByName[node3].transitSwitchPortIPv6), "k8s.ovn.org/node-subnets": fmt.Sprintf(`{"default":[%q,%q]}`, nodeByName[node3].subnetIPv4, nodeByName[node3].subnetIPv6), + "k8s.ovn.org/l3-gateway-config": fmt.Sprintf(`{"default": {"mode": "local", "mac-address":"7e:57:f8:f0:3c:53", "ip-addresses":[%q, %q]}}`, nodeByName[node3].addressIPv4, nodeByName[node3].addressIPv6), + "k8s.ovn.org/node-chassis-id": "3", util.OvnNodeID: nodeByName[node3].nodeID, }, }, @@ -749,6 +821,7 @@ var _ = Describe("OVN Kubevirt Operations", func() { } expectedOVN := []libovsdb.TestData{} + ovnClusterRouter.Policies = []string{} expectedOVNClusterRouter := ovnClusterRouter.DeepCopy() expectedOVNClusterRouter.Policies = []string{} @@ -781,23 +854,38 @@ var _ = Describe("OVN Kubevirt Operations", func() { expectedOVN = append(expectedOVN, ComposeDHCPv6Options(dhcpv6OptionsUUID+d.hostname, t.namespace, &d)) } expectedSourceLSRP := migrationSourceLSRP.DeepCopy() + expectedGWRouter := gwRouter.DeepCopy() expectedOVN = append(expectedOVN, expectedOVNClusterRouter, - gwRouter, + expectedGWRouter, logicalRouterPort, expectedSourceLSRP, ) expectedOVN = kubevirtOVNTestData(t, expectedOVN) + var expectedMigrationTargetGWRouter *nbdb.LogicalRouter if t.migrationTarget.nodeName != "" { expectedTargetLSRP := migrationTargetLSRP.DeepCopy() + expectedMigrationTargetGWRouter = migrationTargetGWRouter.DeepCopy() expectedOVN = append(expectedOVN, migrationTargetLRP, expectedTargetLSRP, - migrationTargetGWRouter, + expectedMigrationTargetGWRouter, ) } + + for router, testpod := range map[*nbdb.LogicalRouter]testVirtLauncherPod{expectedGWRouter: t.testVirtLauncherPod, expectedMigrationTargetGWRouter: t.migrationTarget.testVirtLauncherPod} { + if _, isLocal := fakeOvn.controller.localZoneNodes.Load(testpod.nodeName); isLocal && router != nil && testpod.podName != "" { + natIDs, nats := composeNats(testpod) + router.Nat = append(router.Nat, natIDs...) + for _, nat := range nats { + expectedOVN = append(expectedOVN, nat) + } + } + } + Eventually(fakeOvn.nbClient).Should(libovsdb.HaveData(expectedOVN), "should populate ovn") + if t.replaceNode != "" { By("Replace vm node with newNode at the logical switch manager") newNode := &corev1.Node{ @@ -830,26 +918,87 @@ var _ = Describe("OVN Kubevirt Operations", func() { Expect(err).ToNot(HaveOccurred()) } - if t.podName != "" { - pod, err := fakeOvn.fakeClient.KubeClient.CoreV1().Pods(t.namespace).Get(context.TODO(), t.podName, metav1.GetOptions{}) + var vmIPs []string + if t.addressIPv4 != "" { + vmIPs = append(vmIPs, t.addressIPv4+subnetSuffixIPv4) + } + if t.addressIPv6 != "" { + vmIPs = append(vmIPs, t.addressIPv6+subnetSuffixIPv6) + } + vmIPNets := testing.MustParseIPNets(vmIPs...) 
+ subnet, checkRelease := fakeOvn.controller.lsManager.GetSubnetName(vmIPNets) + + if t.podName == "" { + return nil + } + + completeAndDeletePod := func(namespace, name string) { + GinkgoHelper() + pod, err := fakeOvn.fakeClient.KubeClient.CoreV1().Pods(namespace).Get(context.TODO(), name, metav1.GetOptions{}) Expect(err).NotTo(HaveOccurred()) pod.Status.Phase = corev1.PodSucceeded - _, err = fakeOvn.fakeClient.KubeClient.CoreV1().Pods(t.namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{}) + _, err = fakeOvn.fakeClient.KubeClient.CoreV1().Pods(namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{}) Expect(err).NotTo(HaveOccurred()) - err = fakeOvn.fakeClient.KubeClient.CoreV1().Pods(t.namespace).Delete(context.TODO(), t.podName, metav1.DeleteOptions{}) + err = fakeOvn.fakeClient.KubeClient.CoreV1().Pods(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{}) Expect(err).NotTo(HaveOccurred()) + } - if t.migrationTarget.nodeName != "" { - pod, err := fakeOvn.fakeClient.KubeClient.CoreV1().Pods(t.namespace).Get(context.TODO(), t.migrationTarget.podName, metav1.GetOptions{}) - Expect(err).NotTo(HaveOccurred()) - pod.Status.Phase = corev1.PodSucceeded - _, err = fakeOvn.fakeClient.KubeClient.CoreV1().Pods(t.namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{}) - Expect(err).NotTo(HaveOccurred()) - err = fakeOvn.fakeClient.KubeClient.CoreV1().Pods(t.namespace).Delete(context.TODO(), t.migrationTarget.podName, metav1.DeleteOptions{}) - Expect(err).NotTo(HaveOccurred()) + deleteFirst := t.testVirtLauncherPod + deleteSecond := t.migrationTarget.testVirtLauncherPod + deleteFirstRouter := gwRouter + hasMigration := t.migrationTarget.nodeName != "" + hasUnsuccesfulMigration := hasMigration && t.migrationTarget.updatePhase != nil + if hasUnsuccesfulMigration { + deleteFirst = t.migrationTarget.testVirtLauncherPod + deleteFirstRouter = migrationTargetGWRouter + deleteSecond = t.testVirtLauncherPod + } + + completeAndDeletePod(t.namespace, deleteFirst.podName) + if !hasMigration { + if checkRelease { + Eventually(fakeOvn.controller.lsManager.AllocateIPs). + WithArguments(subnet, vmIPNets). + Should(Succeed(), "should have de-allocated VM IP after termination") } - Eventually(fakeOvn.nbClient).Should(libovsdb.HaveData(expectedNBDBAfterCleanup(expectedStaticRoutes)), "should cleanup ovn") + Eventually(fakeOvn.nbClient).Should( + libovsdb.HaveData(expectedNBDBAfterCleanup(expectedStaticRoutes, nil)), + "should cleanup terminated pod data from ovn", + ) + return nil } + if checkRelease { + Consistently(fakeOvn.controller.lsManager.AllocateIPs). + WithArguments(subnet, vmIPNets). + ShouldNot(Succeed(), "should have not de-allocated VM IP after migration") + } + + Eventually(fakeOvn.nbClient).Should( + libovsdb.HaveData(filterOutStaleVirtLauncherExpectedTestData(t.namespace, deleteFirst.podName, expectedOVN)), + "should cleanup source pod data from ovn", + ) + + completeAndDeletePod(t.namespace, deleteSecond.podName) + if checkRelease { + Eventually(fakeOvn.controller.lsManager.AllocateIPs). + WithArguments(subnet, vmIPNets). + Should(Succeed(), "should have de-allocated target VM IP after termination") + } + + // FIXME: for some reason we don't remove stale NATs of migrated + // VMs. One possible reason is if VMs can migrate within the + // same node and we can race between creation and deletion. Can + // it happen? 
+ // https://github.com/ovn-kubernetes/ovn-kubernetes/issues/5627 + expectedNATs := map[string][]*nbdb.NAT{} + if _, isLocal := fakeOvn.controller.localZoneNodes.Load(deleteFirst.nodeName); isLocal { + _, nats := composeNats(deleteFirst) + expectedNATs[deleteFirstRouter.Name] = nats + } + Eventually(fakeOvn.nbClient).Should( + libovsdb.HaveData(expectedNBDBAfterCleanup(expectedStaticRoutes, expectedNATs)), + "should cleanup terminated target pod data from ovn", + ) return nil } @@ -1261,7 +1410,18 @@ var _ = Describe("OVN Kubevirt Operations", func() { zone: kubevirt.OvnRemoteZone, }, }, - testVirtLauncherPod: virtLauncher2(node1, vm1), + testVirtLauncherPod: testVirtLauncherPod{ + suffix: "1", + testPod: testPod{ + nodeName: node2, + }, + vmName: vm1, + skipPodAnnotations: false, /* add ovn pod annotation */ + }, + migrationTarget: testMigrationTarget{ + lrpNetworks: []string{nodeByName[node1].lrpNetworkIPv4, nodeByName[node1].lrpNetworkIPv6}, + testVirtLauncherPod: virtLauncher2(node1, vm1), + }, expectedDhcpv4: []testDHCPOptions{{ cidr: nodeByName[node1].subnetIPv4, dns: dnsServiceIPv4, diff --git a/go-controller/pkg/ovn/ovn.go b/go-controller/pkg/ovn/ovn.go index d935bca85f..280e41eba3 100644 --- a/go-controller/pkg/ovn/ovn.go +++ b/go-controller/pkg/ovn/ovn.go @@ -335,15 +335,27 @@ func (oc *DefaultNetworkController) removeRemoteZonePod(pod *corev1.Pod) error { return fmt.Errorf("failed to remove the remote zone pod: %w", err) } + // FIXME: there are other things we are probably leaving behind and should + // be removed for completed VMs, like per-pod SNAT. Also + // removeRemoteZonePodFromNamespaceAddressSet above should probably not be + // called for migrations. + // https://github.com/ovn-kubernetes/ovn-kubernetes/issues/5627 if kubevirt.IsPodLiveMigratable(pod) { - ips, err := util.GetPodCIDRsWithFullMask(pod, oc.GetNetInfo()) - if err != nil && !errors.Is(err, util.ErrNoPodIPFound) { - return fmt.Errorf("failed to get pod ips for the pod %s/%s: %w", pod.Namespace, pod.Name, err) + allVMPodsAreCompleted, err := kubevirt.AllVMPodsAreCompleted(oc.watchFactory, pod) + if err != nil { + return err } - switchName, zoneContainsPodSubnet := kubevirt.ZoneContainsPodSubnet(oc.lsManager, ips) - if zoneContainsPodSubnet { - if err := oc.lsManager.ReleaseIPs(switchName, ips); err != nil { - return err + + if allVMPodsAreCompleted { + ips, err := util.GetPodCIDRsWithFullMask(pod, oc.GetNetInfo()) + if err != nil && !errors.Is(err, util.ErrNoPodIPFound) { + return fmt.Errorf("failed to get pod ips for the pod %s/%s: %w", pod.Namespace, pod.Name, err) + } + switchName, zoneContainsPodSubnet := kubevirt.ZoneContainsPodSubnet(oc.lsManager, ips) + if zoneContainsPodSubnet { + if err := oc.lsManager.ReleaseIPs(switchName, ips); err != nil { + return err + } } } } diff --git a/go-controller/pkg/ovn/pods.go b/go-controller/pkg/ovn/pods.go index 0ad9442e3e..80c431ef13 100644 --- a/go-controller/pkg/ovn/pods.go +++ b/go-controller/pkg/ovn/pods.go @@ -211,6 +211,23 @@ func (oc *DefaultNetworkController) deleteLogicalPort(pod *corev1.Pod, portInfo return fmt.Errorf("cannot delete GW Routes for pod %s: %w", podDesc, err) } + if kubevirt.IsPodLiveMigratable(pod) { + switchName, hasLocalIPs := oc.lsManager.GetSubnetName(pInfo.ips) + // don't attempt to release IPs that are not managed by this zone which can + // happen with live migratable pods, otherwise we would get distracting + // error logs on release + if !hasLocalIPs { + klog.V(5).Infof("Inhibiting release of live migratable pod %s/%s IPs %s not 
managed by this zone", + pod.Namespace, pod.Name, + util.JoinIPNetIPs(pInfo.ips, " "), + ) + return nil + } + // a pod might have migrated from one node to another node in the same + // zone so fix the switch for which the release needs to happen + pInfo.logicalSwitch = switchName + } + // Releasing IPs needs to happen last so that we can deterministically know that if delete failed that // the IP of the pod needs to be released. Otherwise we could have a completed pod failed to be removed // and we dont know if the IP was released or not, and subsequently could accidentally release the IP diff --git a/go-controller/pkg/util/egressip/net.go b/go-controller/pkg/util/egressip/net.go new file mode 100644 index 0000000000..018e6d27f6 --- /dev/null +++ b/go-controller/pkg/util/egressip/net.go @@ -0,0 +1,39 @@ +package egressip + +import ( + "math" + "net" + + "github.com/vishvananda/netlink" + "golang.org/x/sys/unix" + + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" +) + +// GetNetlinkAddress returns a netlink address configured with specific +// egress ip parameters +func GetNetlinkAddress(ip net.IP, ifindex int) *netlink.Addr { + return &netlink.Addr{ + IPNet: &net.IPNet{IP: ip, Mask: util.GetIPFullMask(ip)}, + Flags: getNetlinkAddressFlag(ip), + Scope: int(netlink.SCOPE_UNIVERSE), + ValidLft: getNetlinkAddressValidLft(ip), + LinkIndex: ifindex, + } +} + +func getNetlinkAddressFlag(ip net.IP) int { + // isV6? + if ip != nil && ip.To4() == nil && ip.To16() != nil { + return unix.IFA_F_NODAD + } + return 0 +} + +func getNetlinkAddressValidLft(ip net.IP) int { + // isV6? + if ip != nil && ip.To4() == nil && ip.To16() != nil { + return math.MaxUint32 + } + return 0 +} diff --git a/helm/ovn-kubernetes/values-multi-node-zone.yaml b/helm/ovn-kubernetes/values-multi-node-zone.yaml index 8056461256..e3d1c33db7 100644 --- a/helm/ovn-kubernetes/values-multi-node-zone.yaml +++ b/helm/ovn-kubernetes/values-multi-node-zone.yaml @@ -14,7 +14,7 @@ tags: # -- Endpoint of Kubernetes api server k8sAPIServer: https://172.25.0.2:6443 -# -- IP range for Kubernetes pods, /14 is the top level range, under which each /23 range will be assigned to a node +# -- IP range for Kubernetes pods, /16 is the top level range, under which each /24 range will be assigned to a node podNetwork: 10.244.0.0/16/24 # -- A comma-separated set of CIDR notation IP ranges from which k8s assigns service cluster IPs. This should be the same as the value provided for kube-apiserver "--service-cluster-ip-range" option serviceNetwork: 10.96.0.0/16 diff --git a/helm/ovn-kubernetes/values-no-ic.yaml b/helm/ovn-kubernetes/values-no-ic.yaml index f643f81133..a02d849cd9 100644 --- a/helm/ovn-kubernetes/values-no-ic.yaml +++ b/helm/ovn-kubernetes/values-no-ic.yaml @@ -12,7 +12,7 @@ tags: # -- Endpoint of Kubernetes api server k8sAPIServer: https://172.25.0.2:6443 -# -- IP range for Kubernetes pods, /14 is the top level range, under which each /23 range will be assigned to a node +# -- IP range for Kubernetes pods, /16 is the top level range, under which each /24 range will be assigned to a node podNetwork: 10.244.0.0/16/24 # -- A comma-separated set of CIDR notation IP ranges from which k8s assigns service cluster IPs. 
This should be the same as the value provided for kube-apiserver "--service-cluster-ip-range" option serviceNetwork: 10.96.0.0/16 diff --git a/helm/ovn-kubernetes/values-single-node-zone.yaml b/helm/ovn-kubernetes/values-single-node-zone.yaml index 516b77220b..3787ad6823 100644 --- a/helm/ovn-kubernetes/values-single-node-zone.yaml +++ b/helm/ovn-kubernetes/values-single-node-zone.yaml @@ -14,7 +14,7 @@ tags: # -- Endpoint of Kubernetes api server k8sAPIServer: https://172.25.0.2:6443 -# -- IP range for Kubernetes pods, /14 is the top level range, under which each /23 range will be assigned to a node +# -- IP range for Kubernetes pods, /16 is the top level range, under which each /24 range will be assigned to a node podNetwork: 10.244.0.0/16/24 # -- A comma-separated set of CIDR notation IP ranges from which k8s assigns service cluster IPs. This should be the same as the value provided for kube-apiserver "--service-cluster-ip-range" option serviceNetwork: 10.96.0.0/16 diff --git a/mkdocs.yml b/mkdocs.yml index 45f5a277e0..925f2f96d6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -155,5 +155,7 @@ nav: - Preconfigured UDN Addresses: okeps/okep-5233-preconfigured-udn-addresses.md - BGP: okeps/okep-5296-bgp.md - Layer2TransitRouter: okeps/okep-5094-layer2-transit-router.md + - MCP for Troubleshooting: okeps/okep-5494-ovn-kubernetes-mcp-server.md + - Dynamic UDN Node Allocation: okeps/okep-5552-dynamic-udn-node-allocation.md - Blog: - blog/index.md diff --git a/test/e2e/kubevirt.go b/test/e2e/kubevirt.go index 2dfaf5e2e2..c545a1959c 100644 --- a/test/e2e/kubevirt.go +++ b/test/e2e/kubevirt.go @@ -998,7 +998,7 @@ var _ = Describe("Kubevirt Virtual Machines", feature.VirtualMachineSupport, fun "kubevirt.io/allow-pod-bridge-network-live-migration": "", } nodeSelector := map[string]string{ - namespace: "", + namespace: "true", } networkSource := kubevirtv1.NetworkSource{ Pod: &kubevirtv1.PodNetwork{}, @@ -1119,7 +1119,7 @@ passwd: by(vm.Name, "Live migrate for the third time to the node owning the subnet") // Patch back the original node with the label and remove it // from the rest of nodes to force live migration target to it. 
- e2enode.AddOrUpdateLabelOnNode(fr.ClientSet, originalNode, namespace, "") + e2enode.AddOrUpdateLabelOnNode(fr.ClientSet, originalNode, namespace, "true") for _, selectedNode := range selectedNodes { if selectedNode.Name != originalNode { e2enode.RemoveLabelOffNode(fr.ClientSet, selectedNode.Name, namespace) @@ -1405,7 +1405,7 @@ fi // configure VM nodeSelector with it and live migration will take only // them into consideration for _, node := range selectedNodes { - e2enode.AddOrUpdateLabelOnNode(fr.ClientSet, node.Name, namespace, "") + e2enode.AddOrUpdateLabelOnNode(fr.ClientSet, node.Name, namespace, "true") } prepareHTTPServerPods(map[string]string{}, checkPodHasIPAtStatus) diff --git a/test/e2e/network_segmentation_preconfigured_layer2.go b/test/e2e/network_segmentation_preconfigured_layer2.go index 5f7a32c85c..ea360fa38e 100644 --- a/test/e2e/network_segmentation_preconfigured_layer2.go +++ b/test/e2e/network_segmentation_preconfigured_layer2.go @@ -121,7 +121,7 @@ var _ = Describe("Network Segmentation: Preconfigured Layer2 UDN", feature.Netwo role: "primary", defaultGatewayIPs: joinStrings("172.31.0.10", "2014:100:200::100"), reservedCIDRs: joinStrings("172.31.1.0/24", "2014:100:200::/122"), - infrastructureCIDRs: joinStrings("172.31.0.10/30", "2014:100:200::100/122"), + infrastructureCIDRs: joinStrings("172.31.0.8/30", "2014:100:200::100/122"), }, expectedGatewayIPs: []string{"172.31.0.10", "2014:100:200::100"}, }), @@ -137,6 +137,77 @@ var _ = Describe("Network Segmentation: Preconfigured Layer2 UDN", feature.Netwo }), ) + type invalidAPITestConfig struct { + netConfig *networkAttachmentConfigParams + expectedError interface{} + } + DescribeTable("unmasked reserved / infrastructure subnets are not allowed", + func(config invalidAPITestConfig) { + podIPs := filterCIDRs(f.ClientSet, config.netConfig.cidr) + if len(podIPs) == 0 { + Skip("IP family not supported in this environment") + } + + By("creating the L2 network") + netConfig := config.netConfig + + netConfig.namespace = f.Namespace.Name + + udnManifest := generateUserDefinedNetworkManifest(netConfig, f.ClientSet) + cleanup, err := createManifest(netConfig.namespace, udnManifest) + Expect(err).To(MatchError(config.expectedError)) + DeferCleanup(cleanup) + }, + Entry("Layer2 with unmasked IPv4 reserved subnets", invalidAPITestConfig{ + netConfig: &networkAttachmentConfigParams{ + name: "invalid-l2-net-reserved-subnets", + topology: "layer2", + cidr: "172.31.0.0/16", + role: "primary", + reservedCIDRs: "172.31.0.10/30", + }, + expectedError: ContainSubstring( + "Invalid value: \"object\": reservedSubnets must be a masked network address (no host bits set)", + ), + }), + Entry("Layer2 with unmasked IPv6 reserved subnets", invalidAPITestConfig{ + netConfig: &networkAttachmentConfigParams{ + name: "invalid-l2-net-reserved-subnets", + topology: "layer2", + cidr: "2014:100:200::0/60", + role: "primary", + reservedCIDRs: "2014:100:200::88/122", + }, + expectedError: ContainSubstring( + "Invalid value: \"object\": reservedSubnets must be a masked network address (no host bits set)", + ), + }), + Entry("Layer2 with unmasked IPv4 infrastructure subnets", invalidAPITestConfig{ + netConfig: &networkAttachmentConfigParams{ + name: "invalid-l2-net-infra-subnets", + topology: "layer2", + cidr: "172.31.0.0/16", + role: "primary", + infrastructureCIDRs: "172.31.0.10/30", + }, + expectedError: ContainSubstring( + "Invalid value: \"object\": infrastructureSubnets must be a masked network address (no host bits set)", + ), + }), + Entry("Layer2 
with unmasked IPv6 infrastructure subnets", invalidAPITestConfig{ + netConfig: &networkAttachmentConfigParams{ + name: "invalid-l2-net-infra-subnets", + topology: "layer2", + cidr: "2014:100:200::0/60", + role: "primary", + infrastructureCIDRs: "2014:100:200::88/122", + }, + expectedError: ContainSubstring( + "Invalid value: \"object\": infrastructureSubnets must be a masked network address (no host bits set)", + ), + }), + ) + Context("duplicate IP validation with primary UDN layer 2 pods", func() { const ( duplicateIPv4 = "10.128.0.200/16"