From 200eaf59540a073aada537a86b56ad254c690218 Mon Sep 17 00:00:00 2001 From: Riccardo Ravaioli Date: Wed, 30 Mar 2022 17:52:44 +0200 Subject: [PATCH] use generic watchResource with retry logic watchResource() incorporates the whole retry logic seen for network policies and pods and can easily be instantiated for any resource type. Applied it to pods, nodes, network policies and network policy dynamic handlers. Signed-off-by: Riccardo Ravaioli (cherry picked from commit 24a503a7c76fd4d4552a06858d03a55d4568a975) --- go-controller/pkg/factory/factory.go | 207 ++-- go-controller/pkg/factory/factory_test.go | 74 +- go-controller/pkg/factory/handler.go | 18 +- go-controller/pkg/ovn/gress_policy.go | 3 +- go-controller/pkg/ovn/master.go | 95 +- go-controller/pkg/ovn/namespace.go | 10 +- go-controller/pkg/ovn/obj_retry.go | 1067 ++++++++++++++++++++- go-controller/pkg/ovn/ovn.go | 398 +------- go-controller/pkg/ovn/pods.go | 6 - go-controller/pkg/ovn/pods_retry.go | 228 ----- go-controller/pkg/ovn/pods_test.go | 75 +- go-controller/pkg/ovn/policy.go | 419 +++----- go-controller/pkg/ovn/policy_test.go | 22 +- 13 files changed, 1456 insertions(+), 1166 deletions(-) delete mode 100644 go-controller/pkg/ovn/pods_retry.go diff --git a/go-controller/pkg/factory/factory.go b/go-controller/pkg/factory/factory.go index a5a0652481..45ea03b69d 100644 --- a/go-controller/pkg/factory/factory.go +++ b/go-controller/pkg/factory/factory.go @@ -70,16 +70,30 @@ const ( defaultNumEventQueues uint32 = 15 ) +// types for dynamic handlers created when adding a network policy +type peerService struct{} +type peerNamespaceAndPodSelector struct{} +type peerPodForNamespaceAndPodSelector struct{} // created during the add function of peerNamespaceAndPodSelectorType +type peerNamespaceSelector struct{} +type peerPodSelector struct{} +type localPodSelector struct{} + var ( - podType reflect.Type = reflect.TypeOf(&kapi.Pod{}) - serviceType reflect.Type = reflect.TypeOf(&kapi.Service{}) - endpointsType reflect.Type = reflect.TypeOf(&kapi.Endpoints{}) - policyType reflect.Type = reflect.TypeOf(&knet.NetworkPolicy{}) - namespaceType reflect.Type = reflect.TypeOf(&kapi.Namespace{}) - nodeType reflect.Type = reflect.TypeOf(&kapi.Node{}) - egressFirewallType reflect.Type = reflect.TypeOf(&egressfirewallapi.EgressFirewall{}) - egressIPType reflect.Type = reflect.TypeOf(&egressipapi.EgressIP{}) - cloudPrivateIPConfigType reflect.Type = reflect.TypeOf(&ocpcloudnetworkapi.CloudPrivateIPConfig{}) + PodType reflect.Type = reflect.TypeOf(&kapi.Pod{}) + ServiceType reflect.Type = reflect.TypeOf(&kapi.Service{}) + EndpointsType reflect.Type = reflect.TypeOf(&kapi.Endpoints{}) + PolicyType reflect.Type = reflect.TypeOf(&knet.NetworkPolicy{}) + NamespaceType reflect.Type = reflect.TypeOf(&kapi.Namespace{}) + NodeType reflect.Type = reflect.TypeOf(&kapi.Node{}) + EgressFirewallType reflect.Type = reflect.TypeOf(&egressfirewallapi.EgressFirewall{}) + EgressIPType reflect.Type = reflect.TypeOf(&egressipapi.EgressIP{}) + CloudPrivateIPConfigType reflect.Type = reflect.TypeOf(&ocpcloudnetworkapi.CloudPrivateIPConfig{}) + PeerServiceType reflect.Type = reflect.TypeOf(&peerService{}) + PeerNamespaceAndPodSelectorType reflect.Type = reflect.TypeOf(&peerNamespaceAndPodSelector{}) + PeerPodForNamespaceAndPodSelectorType reflect.Type = reflect.TypeOf(&peerPodForNamespaceAndPodSelector{}) + PeerNamespaceSelectorType reflect.Type = reflect.TypeOf(&peerNamespaceSelector{}) + PeerPodSelectorType reflect.Type = reflect.TypeOf(&peerPodSelector{}) + 
LocalPodSelectorType reflect.Type = reflect.TypeOf(&localPodSelector{}) ) // NewMasterWatchFactory initializes a new watch factory for the master or master+node processes. @@ -129,48 +143,48 @@ func NewMasterWatchFactory(ovnClientset *util.OVNClientset) (*WatchFactory, erro var err error // Create our informer-wrapper informer (and underlying shared informer) for types we need - wf.informers[podType], err = newQueuedInformer(podType, wf.iFactory.Core().V1().Pods().Informer(), wf.stopChan, + wf.informers[PodType], err = newQueuedInformer(PodType, wf.iFactory.Core().V1().Pods().Informer(), wf.stopChan, defaultNumEventQueues) if err != nil { return nil, err } - wf.informers[serviceType], err = newInformer(serviceType, wf.iFactory.Core().V1().Services().Informer()) + wf.informers[ServiceType], err = newInformer(ServiceType, wf.iFactory.Core().V1().Services().Informer()) if err != nil { return nil, err } - wf.informers[endpointsType], err = newInformer(endpointsType, wf.iFactory.Core().V1().Endpoints().Informer()) + wf.informers[EndpointsType], err = newInformer(EndpointsType, wf.iFactory.Core().V1().Endpoints().Informer()) if err != nil { return nil, err } - wf.informers[policyType], err = newInformer(policyType, wf.iFactory.Networking().V1().NetworkPolicies().Informer()) + wf.informers[PolicyType], err = newInformer(PolicyType, wf.iFactory.Networking().V1().NetworkPolicies().Informer()) if err != nil { return nil, err } - wf.informers[namespaceType], err = newQueuedInformer(namespaceType, wf.iFactory.Core().V1().Namespaces().Informer(), + wf.informers[NamespaceType], err = newQueuedInformer(NamespaceType, wf.iFactory.Core().V1().Namespaces().Informer(), wf.stopChan, defaultNumEventQueues) if err != nil { return nil, err } - wf.informers[nodeType], err = newQueuedInformer(nodeType, wf.iFactory.Core().V1().Nodes().Informer(), wf.stopChan, + wf.informers[NodeType], err = newQueuedInformer(NodeType, wf.iFactory.Core().V1().Nodes().Informer(), wf.stopChan, defaultNumEventQueues) if err != nil { return nil, err } if config.OVNKubernetesFeature.EnableEgressIP { - wf.informers[egressIPType], err = newInformer(egressIPType, wf.eipFactory.K8s().V1().EgressIPs().Informer()) + wf.informers[EgressIPType], err = newInformer(EgressIPType, wf.eipFactory.K8s().V1().EgressIPs().Informer()) if err != nil { return nil, err } } if config.OVNKubernetesFeature.EnableEgressFirewall { - wf.informers[egressFirewallType], err = newInformer(egressFirewallType, wf.efFactory.K8s().V1().EgressFirewalls().Informer()) + wf.informers[EgressFirewallType], err = newInformer(EgressFirewallType, wf.efFactory.K8s().V1().EgressFirewalls().Informer()) if err != nil { return nil, err } } if util.PlatformTypeIsEgressIPCloudProvider() { - wf.informers[cloudPrivateIPConfigType], err = newInformer(cloudPrivateIPConfigType, wf.cpipcFactory.Cloud().V1().CloudPrivateIPConfigs().Informer()) + wf.informers[CloudPrivateIPConfigType], err = newInformer(CloudPrivateIPConfigType, wf.cpipcFactory.Cloud().V1().CloudPrivateIPConfigs().Informer()) if err != nil { return nil, err } @@ -254,21 +268,21 @@ func NewNodeWatchFactory(ovnClientset *util.OVNClientset, nodeName string) (*Wat }) var err error - wf.informers[podType], err = newQueuedInformer(podType, wf.iFactory.Core().V1().Pods().Informer(), wf.stopChan, + wf.informers[PodType], err = newQueuedInformer(PodType, wf.iFactory.Core().V1().Pods().Informer(), wf.stopChan, defaultNumEventQueues) if err != nil { return nil, err } - wf.informers[serviceType], err = newInformer(serviceType, 
wf.iFactory.Core().V1().Services().Informer()) + wf.informers[ServiceType], err = newInformer(ServiceType, wf.iFactory.Core().V1().Services().Informer()) if err != nil { return nil, err } - wf.informers[endpointsType], err = newInformer(endpointsType, wf.iFactory.Core().V1().Endpoints().Informer()) + wf.informers[EndpointsType], err = newInformer(EndpointsType, wf.iFactory.Core().V1().Endpoints().Informer()) if err != nil { return nil, err } - wf.informers[nodeType], err = newInformer(nodeType, wf.iFactory.Core().V1().Nodes().Informer()) + wf.informers[NodeType], err = newInformer(NodeType, wf.iFactory.Core().V1().Nodes().Informer()) if err != nil { return nil, err } @@ -287,39 +301,39 @@ func (wf *WatchFactory) Shutdown() { func getObjectMeta(objType reflect.Type, obj interface{}) (*metav1.ObjectMeta, error) { switch objType { - case podType: + case PodType: if pod, ok := obj.(*kapi.Pod); ok { return &pod.ObjectMeta, nil } - case serviceType: + case ServiceType: if service, ok := obj.(*kapi.Service); ok { return &service.ObjectMeta, nil } - case endpointsType: + case EndpointsType: if endpoints, ok := obj.(*kapi.Endpoints); ok { return &endpoints.ObjectMeta, nil } - case policyType: + case PolicyType: if policy, ok := obj.(*knet.NetworkPolicy); ok { return &policy.ObjectMeta, nil } - case namespaceType: + case NamespaceType: if namespace, ok := obj.(*kapi.Namespace); ok { return &namespace.ObjectMeta, nil } - case nodeType: + case NodeType: if node, ok := obj.(*kapi.Node); ok { return &node.ObjectMeta, nil } - case egressFirewallType: + case EgressFirewallType: if egressFirewall, ok := obj.(*egressfirewallapi.EgressFirewall); ok { return &egressFirewall.ObjectMeta, nil } - case egressIPType: + case EgressIPType: if egressIP, ok := obj.(*egressipapi.EgressIP); ok { return &egressIP.ObjectMeta, nil } - case cloudPrivateIPConfigType: + case CloudPrivateIPConfigType: if cloudPrivateIPConfig, ok := obj.(*ocpcloudnetworkapi.CloudPrivateIPConfig); ok { return &cloudPrivateIPConfig.ObjectMeta, nil } @@ -327,6 +341,49 @@ func getObjectMeta(objType reflect.Type, obj interface{}) (*metav1.ObjectMeta, e return nil, fmt.Errorf("cannot get ObjectMeta from type %v", objType) } +type AddHandlerFuncType func(namespace string, sel labels.Selector, funcs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler + +func (wf *WatchFactory) GetResourceHandlerFunc(objType reflect.Type) (AddHandlerFuncType, error) { + switch objType { + case PolicyType: + return func(namespace string, sel labels.Selector, + funcs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { + return wf.AddPolicyHandler(funcs, processExisting) + }, nil + + case NodeType: + return func(namespace string, sel labels.Selector, + funcs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { + return wf.AddNodeHandler(funcs, processExisting) + }, nil + + case PeerServiceType: + return func(namespace string, sel labels.Selector, + funcs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { + return wf.AddFilteredServiceHandler(namespace, funcs, processExisting) + }, nil + + case PeerPodSelectorType, LocalPodSelectorType, PodType: + return func(namespace string, sel labels.Selector, + funcs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { + return wf.AddFilteredPodHandler(namespace, sel, funcs, processExisting) + }, nil + + case PeerNamespaceAndPodSelectorType, PeerNamespaceSelectorType: + return func(namespace string, sel labels.Selector, 
+ funcs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { + return wf.AddFilteredNamespaceHandler(namespace, sel, funcs, processExisting) + }, nil + + case PeerPodForNamespaceAndPodSelectorType: + return func(namespace string, sel labels.Selector, + funcs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { + return wf.AddFilteredPodHandler(namespace, sel, funcs, processExisting) + }, nil + } + return nil, fmt.Errorf("cannot get ObjectMeta from type %v", objType) +} + func (wf *WatchFactory) addHandler(objType reflect.Type, namespace string, sel labels.Selector, funcs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { inf, ok := wf.informers[objType] if !ok { @@ -379,140 +436,140 @@ func (wf *WatchFactory) removeHandler(objType reflect.Type, handler *Handler) { // AddPodHandler adds a handler function that will be executed on Pod object changes func (wf *WatchFactory) AddPodHandler(handlerFuncs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { - return wf.addHandler(podType, "", nil, handlerFuncs, processExisting) + return wf.addHandler(PodType, "", nil, handlerFuncs, processExisting) } // AddFilteredPodHandler adds a handler function that will be executed when Pod objects that match the given filters change func (wf *WatchFactory) AddFilteredPodHandler(namespace string, sel labels.Selector, handlerFuncs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { - return wf.addHandler(podType, namespace, sel, handlerFuncs, processExisting) + return wf.addHandler(PodType, namespace, sel, handlerFuncs, processExisting) } // RemovePodHandler removes a Pod object event handler function func (wf *WatchFactory) RemovePodHandler(handler *Handler) { - wf.removeHandler(podType, handler) + wf.removeHandler(PodType, handler) } // AddServiceHandler adds a handler function that will be executed on Service object changes func (wf *WatchFactory) AddServiceHandler(handlerFuncs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { - return wf.addHandler(serviceType, "", nil, handlerFuncs, processExisting) + return wf.addHandler(ServiceType, "", nil, handlerFuncs, processExisting) } // AddFilteredServiceHandler adds a handler function that will be executed on all Service object changes for a specific namespace func (wf *WatchFactory) AddFilteredServiceHandler(namespace string, handlerFuncs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { - return wf.addHandler(serviceType, namespace, nil, handlerFuncs, processExisting) + return wf.addHandler(ServiceType, namespace, nil, handlerFuncs, processExisting) } // RemoveServiceHandler removes a Service object event handler function func (wf *WatchFactory) RemoveServiceHandler(handler *Handler) { - wf.removeHandler(serviceType, handler) + wf.removeHandler(ServiceType, handler) } // AddEndpointsHandler adds a handler function that will be executed on Endpoints object changes func (wf *WatchFactory) AddEndpointsHandler(handlerFuncs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { - return wf.addHandler(endpointsType, "", nil, handlerFuncs, processExisting) + return wf.addHandler(EndpointsType, "", nil, handlerFuncs, processExisting) } // AddFilteredEndpointsHandler adds a handler function that will be executed when Endpoint objects that match the given filters change func (wf *WatchFactory) AddFilteredEndpointsHandler(namespace string, sel labels.Selector, handlerFuncs 
cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { - return wf.addHandler(endpointsType, namespace, sel, handlerFuncs, processExisting) + return wf.addHandler(EndpointsType, namespace, sel, handlerFuncs, processExisting) } // RemoveEndpointsHandler removes a Endpoints object event handler function func (wf *WatchFactory) RemoveEndpointsHandler(handler *Handler) { - wf.removeHandler(endpointsType, handler) + wf.removeHandler(EndpointsType, handler) } // AddPolicyHandler adds a handler function that will be executed on NetworkPolicy object changes func (wf *WatchFactory) AddPolicyHandler(handlerFuncs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { - return wf.addHandler(policyType, "", nil, handlerFuncs, processExisting) + return wf.addHandler(PolicyType, "", nil, handlerFuncs, processExisting) } // RemovePolicyHandler removes a NetworkPolicy object event handler function func (wf *WatchFactory) RemovePolicyHandler(handler *Handler) { - wf.removeHandler(policyType, handler) + wf.removeHandler(PolicyType, handler) } // AddEgressFirewallHandler adds a handler function that will be executed on EgressFirewall object changes func (wf *WatchFactory) AddEgressFirewallHandler(handlerFuncs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { - return wf.addHandler(egressFirewallType, "", nil, handlerFuncs, processExisting) + return wf.addHandler(EgressFirewallType, "", nil, handlerFuncs, processExisting) } // RemoveEgressFirewallHandler removes an EgressFirewall object event handler function func (wf *WatchFactory) RemoveEgressFirewallHandler(handler *Handler) { - wf.removeHandler(egressFirewallType, handler) + wf.removeHandler(EgressFirewallType, handler) } // AddEgressIPHandler adds a handler function that will be executed on EgressIP object changes func (wf *WatchFactory) AddEgressIPHandler(handlerFuncs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { - return wf.addHandler(egressIPType, "", nil, handlerFuncs, processExisting) + return wf.addHandler(EgressIPType, "", nil, handlerFuncs, processExisting) } // RemoveEgressIPHandler removes an EgressIP object event handler function func (wf *WatchFactory) RemoveEgressIPHandler(handler *Handler) { - wf.removeHandler(egressIPType, handler) + wf.removeHandler(EgressIPType, handler) } // AddCloudPrivateIPConfigHandler adds a handler function that will be executed on CloudPrivateIPConfig object changes func (wf *WatchFactory) AddCloudPrivateIPConfigHandler(handlerFuncs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { - return wf.addHandler(cloudPrivateIPConfigType, "", nil, handlerFuncs, processExisting) + return wf.addHandler(CloudPrivateIPConfigType, "", nil, handlerFuncs, processExisting) } // RemoveCloudPrivateIPConfigHandler removes an CloudPrivateIPConfig object event handler function func (wf *WatchFactory) RemoveCloudPrivateIPConfigHandler(handler *Handler) { - wf.removeHandler(cloudPrivateIPConfigType, handler) + wf.removeHandler(CloudPrivateIPConfigType, handler) } // AddNamespaceHandler adds a handler function that will be executed on Namespace object changes func (wf *WatchFactory) AddNamespaceHandler(handlerFuncs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { - return wf.addHandler(namespaceType, "", nil, handlerFuncs, processExisting) + return wf.addHandler(NamespaceType, "", nil, handlerFuncs, processExisting) } // AddFilteredNamespaceHandler adds a handler function that will be 
executed when Namespace objects that match the given filters change func (wf *WatchFactory) AddFilteredNamespaceHandler(namespace string, sel labels.Selector, handlerFuncs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { - return wf.addHandler(namespaceType, namespace, sel, handlerFuncs, processExisting) + return wf.addHandler(NamespaceType, namespace, sel, handlerFuncs, processExisting) } // RemoveNamespaceHandler removes a Namespace object event handler function func (wf *WatchFactory) RemoveNamespaceHandler(handler *Handler) { - wf.removeHandler(namespaceType, handler) + wf.removeHandler(NamespaceType, handler) } // AddNodeHandler adds a handler function that will be executed on Node object changes func (wf *WatchFactory) AddNodeHandler(handlerFuncs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { - return wf.addHandler(nodeType, "", nil, handlerFuncs, processExisting) + return wf.addHandler(NodeType, "", nil, handlerFuncs, processExisting) } // AddFilteredNodeHandler dds a handler function that will be executed when Node objects that match the given label selector func (wf *WatchFactory) AddFilteredNodeHandler(sel labels.Selector, handlerFuncs cache.ResourceEventHandler, processExisting func([]interface{})) *Handler { - return wf.addHandler(nodeType, "", sel, handlerFuncs, processExisting) + return wf.addHandler(NodeType, "", sel, handlerFuncs, processExisting) } // RemoveNodeHandler removes a Node object event handler function func (wf *WatchFactory) RemoveNodeHandler(handler *Handler) { - wf.removeHandler(nodeType, handler) + wf.removeHandler(NodeType, handler) } // GetPod returns the pod spec given the namespace and pod name func (wf *WatchFactory) GetPod(namespace, name string) (*kapi.Pod, error) { - podLister := wf.informers[podType].lister.(listers.PodLister) + podLister := wf.informers[PodType].lister.(listers.PodLister) return podLister.Pods(namespace).Get(name) } // GetAllPods returns all the pods in the cluster func (wf *WatchFactory) GetAllPods() ([]*kapi.Pod, error) { - podLister := wf.informers[podType].lister.(listers.PodLister) + podLister := wf.informers[PodType].lister.(listers.PodLister) return podLister.List(labels.Everything()) } // GetPods returns all the pods in a given namespace func (wf *WatchFactory) GetPods(namespace string) ([]*kapi.Pod, error) { - podLister := wf.informers[podType].lister.(listers.PodLister) + podLister := wf.informers[PodType].lister.(listers.PodLister) return podLister.Pods(namespace).List(labels.Everything()) } // GetPodsBySelector returns all the pods in a given namespace by the label selector func (wf *WatchFactory) GetPodsBySelector(namespace string, labelSelector metav1.LabelSelector) ([]*kapi.Pod, error) { - podLister := wf.informers[podType].lister.(listers.PodLister) + podLister := wf.informers[PodType].lister.(listers.PodLister) selector, err := metav1.LabelSelectorAsSelector(&labelSelector) if err != nil { return nil, err @@ -527,64 +584,64 @@ func (wf *WatchFactory) GetNodes() ([]*kapi.Node, error) { // ListNodes returns nodes that match a selector func (wf *WatchFactory) ListNodes(selector labels.Selector) ([]*kapi.Node, error) { - nodeLister := wf.informers[nodeType].lister.(listers.NodeLister) + nodeLister := wf.informers[NodeType].lister.(listers.NodeLister) return nodeLister.List(selector) } // GetNode returns the node spec of a given node by name func (wf *WatchFactory) GetNode(name string) (*kapi.Node, error) { - nodeLister := 
wf.informers[nodeType].lister.(listers.NodeLister) + nodeLister := wf.informers[NodeType].lister.(listers.NodeLister) return nodeLister.Get(name) } // GetService returns the service spec of a service in a given namespace func (wf *WatchFactory) GetService(namespace, name string) (*kapi.Service, error) { - serviceLister := wf.informers[serviceType].lister.(listers.ServiceLister) + serviceLister := wf.informers[ServiceType].lister.(listers.ServiceLister) return serviceLister.Services(namespace).Get(name) } // GetEndpoints returns the endpoints list in a given namespace func (wf *WatchFactory) GetEndpoints(namespace string) ([]*kapi.Endpoints, error) { - endpointsLister := wf.informers[endpointsType].lister.(listers.EndpointsLister) + endpointsLister := wf.informers[EndpointsType].lister.(listers.EndpointsLister) return endpointsLister.Endpoints(namespace).List(labels.Everything()) } // GetEndpoint returns a specific endpoint in a given namespace func (wf *WatchFactory) GetEndpoint(namespace, name string) (*kapi.Endpoints, error) { - endpointsLister := wf.informers[endpointsType].lister.(listers.EndpointsLister) + endpointsLister := wf.informers[EndpointsType].lister.(listers.EndpointsLister) return endpointsLister.Endpoints(namespace).Get(name) } func (wf *WatchFactory) GetCloudPrivateIPConfig(name string) (*ocpcloudnetworkapi.CloudPrivateIPConfig, error) { - cloudPrivateIPConfigLister := wf.informers[cloudPrivateIPConfigType].lister.(ocpcloudnetworklister.CloudPrivateIPConfigLister) + cloudPrivateIPConfigLister := wf.informers[CloudPrivateIPConfigType].lister.(ocpcloudnetworklister.CloudPrivateIPConfigLister) return cloudPrivateIPConfigLister.Get(name) } func (wf *WatchFactory) GetEgressIP(name string) (*egressipapi.EgressIP, error) { - egressIPLister := wf.informers[egressIPType].lister.(egressiplister.EgressIPLister) + egressIPLister := wf.informers[EgressIPType].lister.(egressiplister.EgressIPLister) return egressIPLister.Get(name) } func (wf *WatchFactory) GetEgressIPs() ([]*egressipapi.EgressIP, error) { - egressIPLister := wf.informers[egressIPType].lister.(egressiplister.EgressIPLister) + egressIPLister := wf.informers[EgressIPType].lister.(egressiplister.EgressIPLister) return egressIPLister.List(labels.Everything()) } // GetNamespace returns a specific namespace func (wf *WatchFactory) GetNamespace(name string) (*kapi.Namespace, error) { - namespaceLister := wf.informers[namespaceType].lister.(listers.NamespaceLister) + namespaceLister := wf.informers[NamespaceType].lister.(listers.NamespaceLister) return namespaceLister.Get(name) } // GetNamespaces returns a list of namespaces in the cluster func (wf *WatchFactory) GetNamespaces() ([]*kapi.Namespace, error) { - namespaceLister := wf.informers[namespaceType].lister.(listers.NamespaceLister) + namespaceLister := wf.informers[NamespaceType].lister.(listers.NamespaceLister) return namespaceLister.List(labels.Everything()) } // GetNamespacesBySelector returns a list of namespaces in the cluster by the label selector func (wf *WatchFactory) GetNamespacesBySelector(labelSelector metav1.LabelSelector) ([]*kapi.Namespace, error) { - namespaceLister := wf.informers[namespaceType].lister.(listers.NamespaceLister) + namespaceLister := wf.informers[NamespaceType].lister.(listers.NamespaceLister) selector, err := metav1.LabelSelectorAsSelector(&labelSelector) if err != nil { return nil, err @@ -594,30 +651,30 @@ func (wf *WatchFactory) GetNamespacesBySelector(labelSelector metav1.LabelSelect // GetNetworkPolicy gets a specific network policy by 
the namespace/name func (wf *WatchFactory) GetNetworkPolicy(namespace, name string) (*knet.NetworkPolicy, error) { - networkPolicyLister := wf.informers[policyType].lister.(netlisters.NetworkPolicyLister) + networkPolicyLister := wf.informers[PolicyType].lister.(netlisters.NetworkPolicyLister) return networkPolicyLister.NetworkPolicies(namespace).Get(name) } func (wf *WatchFactory) NodeInformer() cache.SharedIndexInformer { - return wf.informers[nodeType].inf + return wf.informers[NodeType].inf } // LocalPodInformer returns a shared Informer that may or may not only // return pods running on the local node. func (wf *WatchFactory) LocalPodInformer() cache.SharedIndexInformer { - return wf.informers[podType].inf + return wf.informers[PodType].inf } func (wf *WatchFactory) PodInformer() cache.SharedIndexInformer { - return wf.informers[podType].inf + return wf.informers[PodType].inf } func (wf *WatchFactory) NamespaceInformer() cache.SharedIndexInformer { - return wf.informers[namespaceType].inf + return wf.informers[NamespaceType].inf } func (wf *WatchFactory) ServiceInformer() cache.SharedIndexInformer { - return wf.informers[serviceType].inf + return wf.informers[ServiceType].inf } // noHeadlessServiceSelector is a LabelSelector added to the watch for diff --git a/go-controller/pkg/factory/factory_test.go b/go-controller/pkg/factory/factory_test.go index ecbfe099d8..ebec52bd32 100644 --- a/go-controller/pkg/factory/factory_test.go +++ b/go-controller/pkg/factory/factory_test.go @@ -343,45 +343,45 @@ var _ = Describe("Watch Factory Operations", func() { It("is called for each existing pod", func() { pods = append(pods, newPod("pod1", "default")) - testExisting(podType, "", nil) + testExisting(PodType, "", nil) }) It("is called for each existing namespace", func() { namespaces = append(namespaces, newNamespace("default")) - testExisting(namespaceType, "", nil) + testExisting(NamespaceType, "", nil) }) It("is called for each existing node", func() { nodes = append(nodes, newNode("default")) - testExisting(nodeType, "", nil) + testExisting(NodeType, "", nil) }) It("is called for each existing policy", func() { policies = append(policies, newPolicy("denyall", "default")) - testExisting(policyType, "", nil) + testExisting(PolicyType, "", nil) }) It("is called for each existing endpoints", func() { endpoints = append(endpoints, newEndpoints("myendpoint", "default")) - testExisting(endpointsType, "", nil) + testExisting(EndpointsType, "", nil) }) It("is called for each existing service", func() { services = append(services, newService("myservice", "default")) - testExisting(serviceType, "", nil) + testExisting(ServiceType, "", nil) }) It("is called for each existing egressFirewall", func() { egressFirewalls = append(egressFirewalls, newEgressFirewall("myEgressFirewall", "default")) - testExisting(egressFirewallType, "", nil) + testExisting(EgressFirewallType, "", nil) }) It("is called for each existing egressIP", func() { egressIPs = append(egressIPs, newEgressIP("myEgressIP", "default")) - testExisting(egressIPType, "", nil) + testExisting(EgressIPType, "", nil) }) It("is called for each existing cloudPrivateIPConfig", func() { cloudPrivateIPConfigs = append(cloudPrivateIPConfigs, newCloudPrivateIPConfig("192.168.176.25")) - testExisting(cloudPrivateIPConfigType, "", nil) + testExisting(CloudPrivateIPConfigType, "", nil) }) It("is called for each existing pod that matches a given namespace and label", func() { @@ -396,7 +396,7 @@ var _ = Describe("Watch Factory Operations", func() { ) 
Expect(err).NotTo(HaveOccurred()) - testExisting(podType, "default", sel) + testExisting(PodType, "default", sel) }) }) @@ -422,52 +422,52 @@ var _ = Describe("Watch Factory Operations", func() { It("calls ADD for each existing pod", func() { pods = append(pods, newPod("pod1", "default")) pods = append(pods, newPod("pod2", "default")) - testExisting(podType) + testExisting(PodType) }) It("calls ADD for each existing namespace", func() { namespaces = append(namespaces, newNamespace("default")) namespaces = append(namespaces, newNamespace("default2")) - testExisting(namespaceType) + testExisting(NamespaceType) }) It("calls ADD for each existing node", func() { nodes = append(nodes, newNode("default")) nodes = append(nodes, newNode("default2")) - testExisting(nodeType) + testExisting(NodeType) }) It("calls ADD for each existing policy", func() { policies = append(policies, newPolicy("denyall", "default")) policies = append(policies, newPolicy("denyall2", "default")) - testExisting(policyType) + testExisting(PolicyType) }) It("calls ADD for each existing endpoints", func() { endpoints = append(endpoints, newEndpoints("myendpoint", "default")) endpoints = append(endpoints, newEndpoints("myendpoint2", "default")) - testExisting(endpointsType) + testExisting(EndpointsType) }) It("calls ADD for each existing service", func() { services = append(services, newService("myservice", "default")) services = append(services, newService("myservice2", "default")) - testExisting(serviceType) + testExisting(ServiceType) }) It("calls ADD for each existing egressFirewall", func() { egressFirewalls = append(egressFirewalls, newEgressFirewall("myFirewall", "default")) egressFirewalls = append(egressFirewalls, newEgressFirewall("myFirewall1", "default")) - testExisting(egressFirewallType) + testExisting(EgressFirewallType) }) It("calls ADD for each existing egressIP", func() { egressIPs = append(egressIPs, newEgressIP("myEgressIP", "default")) egressIPs = append(egressIPs, newEgressIP("myEgressIP1", "default")) - testExisting(egressIPType) + testExisting(EgressIPType) }) It("calls ADD for each existing cloudPrivateIPConfig", func() { cloudPrivateIPConfigs = append(cloudPrivateIPConfigs, newCloudPrivateIPConfig("192.168.126.25")) cloudPrivateIPConfigs = append(cloudPrivateIPConfigs, newCloudPrivateIPConfig("192.168.126.26")) - testExisting(cloudPrivateIPConfigType) + testExisting(CloudPrivateIPConfigType) }) }) @@ -481,7 +481,7 @@ var _ = Describe("Watch Factory Operations", func() { } It("does not contain Egress IP informer", func() { config.OVNKubernetesFeature.EnableEgressIP = false - testExisting(egressIPType) + testExisting(EgressIPType) }) }) Context("when EgressFirewall is disabled", func() { @@ -494,7 +494,7 @@ var _ = Describe("Watch Factory Operations", func() { } It("does not contain EgressFirewall informer", func() { config.OVNKubernetesFeature.EnableEgressFirewall = false - testExisting(egressFirewallType) + testExisting(EgressFirewallType) }) }) @@ -532,7 +532,7 @@ var _ = Describe("Watch Factory Operations", func() { Expect(err).NotTo(HaveOccurred()) added := newPod("pod1", "default") - h, c := addHandler(wf, podType, cache.ResourceEventHandlerFuncs{ + h, c := addHandler(wf, PodType, cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { pod := obj.(*v1.Pod) Expect(reflect.DeepEqual(pod, added)).To(BeTrue()) @@ -583,7 +583,7 @@ var _ = Describe("Watch Factory Operations", func() { testPods[name] = &opTest{pod: pod} } - h, c := addHandler(wf, podType, cache.ResourceEventHandlerFuncs{ + h, c 
:= addHandler(wf, PodType, cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { pod := obj.(*v1.Pod) ot, ok := testPods[pod.Name] @@ -651,7 +651,7 @@ var _ = Describe("Watch Factory Operations", func() { Expect(err).NotTo(HaveOccurred()) added := newNamespace("default") - h, c := addHandler(wf, namespaceType, cache.ResourceEventHandlerFuncs{ + h, c := addHandler(wf, NamespaceType, cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { ns := obj.(*v1.Namespace) Expect(reflect.DeepEqual(ns, added)).To(BeTrue()) @@ -687,7 +687,7 @@ var _ = Describe("Watch Factory Operations", func() { Expect(err).NotTo(HaveOccurred()) added := newNode("mynode") - h, c := addHandler(wf, nodeType, cache.ResourceEventHandlerFuncs{ + h, c := addHandler(wf, NodeType, cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { node := obj.(*v1.Node) Expect(reflect.DeepEqual(node, added)).To(BeTrue()) @@ -737,7 +737,7 @@ var _ = Describe("Watch Factory Operations", func() { testNodes[name] = &opTest{node: node} } - h, c := addHandler(wf, nodeType, cache.ResourceEventHandlerFuncs{ + h, c := addHandler(wf, NodeType, cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { node := obj.(*v1.Node) ot, ok := testNodes[node.Name] @@ -820,7 +820,7 @@ var _ = Describe("Watch Factory Operations", func() { err = wf.Start() Expect(err).NotTo(HaveOccurred()) - h, c := addHandler(wf, nodeType, cache.ResourceEventHandlerFuncs{ + h, c := addHandler(wf, NodeType, cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { defer GinkgoRecover() node := obj.(*v1.Node) @@ -914,7 +914,7 @@ var _ = Describe("Watch Factory Operations", func() { }() startWg.Wait() - h, c := addHandler(wf, namespaceType, cache.ResourceEventHandlerFuncs{ + h, c := addHandler(wf, NamespaceType, cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { defer GinkgoRecover() namespace := obj.(*v1.Namespace) @@ -968,7 +968,7 @@ var _ = Describe("Watch Factory Operations", func() { Expect(err).NotTo(HaveOccurred()) added := newPolicy("mypolicy", "default") - h, c := addHandler(wf, policyType, cache.ResourceEventHandlerFuncs{ + h, c := addHandler(wf, PolicyType, cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { np := obj.(*knet.NetworkPolicy) Expect(reflect.DeepEqual(np, added)).To(BeTrue()) @@ -1004,7 +1004,7 @@ var _ = Describe("Watch Factory Operations", func() { Expect(err).NotTo(HaveOccurred()) added := newEndpoints("myendpoints", "default") - h, c := addHandler(wf, endpointsType, cache.ResourceEventHandlerFuncs{ + h, c := addHandler(wf, EndpointsType, cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { eps := obj.(*v1.Endpoints) Expect(reflect.DeepEqual(eps, added)).To(BeTrue()) @@ -1047,7 +1047,7 @@ var _ = Describe("Watch Factory Operations", func() { Expect(err).NotTo(HaveOccurred()) added := newService("myservice", "default") - h, c := addHandler(wf, serviceType, cache.ResourceEventHandlerFuncs{ + h, c := addHandler(wf, ServiceType, cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { service := obj.(*v1.Service) Expect(reflect.DeepEqual(service, added)).To(BeTrue()) @@ -1083,7 +1083,7 @@ var _ = Describe("Watch Factory Operations", func() { Expect(err).NotTo(HaveOccurred()) added := newEgressFirewall("myEgressFirewall", "default") - h, c := addHandler(wf, egressFirewallType, cache.ResourceEventHandlerFuncs{ + h, c := addHandler(wf, EgressFirewallType, cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { egressFirewall := 
obj.(*egressfirewall.EgressFirewall) Expect(reflect.DeepEqual(egressFirewall, added)).To(BeTrue()) @@ -1118,7 +1118,7 @@ var _ = Describe("Watch Factory Operations", func() { Expect(err).NotTo(HaveOccurred()) added := newEgressIP("myEgressIP", "default") - h, c := addHandler(wf, egressIPType, cache.ResourceEventHandlerFuncs{ + h, c := addHandler(wf, EgressIPType, cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { egressIP := obj.(*egressip.EgressIP) Expect(reflect.DeepEqual(egressIP, added)).To(BeTrue()) @@ -1153,7 +1153,7 @@ var _ = Describe("Watch Factory Operations", func() { Expect(err).NotTo(HaveOccurred()) added := newCloudPrivateIPConfig("192.168.126.25") - h, c := addHandler(wf, cloudPrivateIPConfigType, cache.ResourceEventHandlerFuncs{ + h, c := addHandler(wf, CloudPrivateIPConfigType, cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { cloudPrivateIPConfig := obj.(*ocpcloudnetworkapi.CloudPrivateIPConfig) Expect(reflect.DeepEqual(cloudPrivateIPConfig, added)).To(BeTrue()) @@ -1188,7 +1188,7 @@ var _ = Describe("Watch Factory Operations", func() { Expect(err).NotTo(HaveOccurred()) added := newNamespace("default") - h, c := addHandler(wf, namespaceType, cache.ResourceEventHandlerFuncs{ + h, c := addHandler(wf, NamespaceType, cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) {}, UpdateFunc: func(old, new interface{}) {}, DeleteFunc: func(obj interface{}) {}, @@ -1233,7 +1233,7 @@ var _ = Describe("Watch Factory Operations", func() { Expect(err).NotTo(HaveOccurred()) _, c := addFilteredHandler(wf, - podType, + PodType, "default", sel, cache.ResourceEventHandlerFuncs{ @@ -1301,7 +1301,7 @@ var _ = Describe("Watch Factory Operations", func() { equalPod := pod h, c := addFilteredHandler(wf, - podType, + PodType, "default", sel, cache.ResourceEventHandlerFuncs{ diff --git a/go-controller/pkg/factory/handler.go b/go-controller/pkg/factory/handler.go index 79bc1aaaa5..850834a764 100644 --- a/go-controller/pkg/factory/handler.go +++ b/go-controller/pkg/factory/handler.go @@ -361,23 +361,23 @@ func (i *informer) shutdown() { func newInformerLister(oType reflect.Type, sharedInformer cache.SharedIndexInformer) (listerInterface, error) { switch oType { - case podType: + case PodType: return listers.NewPodLister(sharedInformer.GetIndexer()), nil - case serviceType: + case ServiceType: return listers.NewServiceLister(sharedInformer.GetIndexer()), nil - case endpointsType: + case EndpointsType: return listers.NewEndpointsLister(sharedInformer.GetIndexer()), nil - case namespaceType: + case NamespaceType: return listers.NewNamespaceLister(sharedInformer.GetIndexer()), nil - case nodeType: + case NodeType: return listers.NewNodeLister(sharedInformer.GetIndexer()), nil - case policyType: + case PolicyType: return netlisters.NewNetworkPolicyLister(sharedInformer.GetIndexer()), nil - case egressFirewallType: + case EgressFirewallType: return egressfirewalllister.NewEgressFirewallLister(sharedInformer.GetIndexer()), nil - case egressIPType: + case EgressIPType: return egressiplister.NewEgressIPLister(sharedInformer.GetIndexer()), nil - case cloudPrivateIPConfigType: + case CloudPrivateIPConfigType: return cloudprivateipconfiglister.NewCloudPrivateIPConfigLister(sharedInformer.GetIndexer()), nil } diff --git a/go-controller/pkg/ovn/gress_policy.go b/go-controller/pkg/ovn/gress_policy.go index 9098b3ea1e..965b22ac2d 100644 --- a/go-controller/pkg/ovn/gress_policy.go +++ b/go-controller/pkg/ovn/gress_policy.go @@ -247,7 +247,8 @@ func (gp *gressPolicy) 
getMatchFromIPBlock(lportMatch, l4Match string) []string } // addNamespaceAddressSet adds a namespace address set to the gress policy -// if it does not exist and returns `false` if it does. +// if the address set does not exist and returns `true`; if the address set already exists, +// it returns `false`. func (gp *gressPolicy) addNamespaceAddressSet(name string) bool { v4HashName, v6HashName := addressset.MakeAddressSetHashNames(name) v4HashName = "$" + v4HashName diff --git a/go-controller/pkg/ovn/master.go b/go-controller/pkg/ovn/master.go index 2d0186c156..86a187f58d 100644 --- a/go-controller/pkg/ovn/master.go +++ b/go-controller/pkg/ovn/master.go @@ -3,8 +3,6 @@ package ovn import ( "context" "fmt" - "k8s.io/apimachinery/pkg/fields" - "math/rand" "net" "os" "strings" @@ -12,8 +10,8 @@ import ( "time" kapi "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" kerrors "k8s.io/apimachinery/pkg/util/errors" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/sets" @@ -1395,12 +1393,6 @@ func (oc *Controller) syncWithRetry(syncName string, syncFunc func() error) { // watchNodes() will be called for all existing nodes at startup anyway. // Note that this list will include the 'join' cluster switch, which we // do not want to delete. -func (oc *Controller) syncNodes(nodes []interface{}) { - oc.syncWithRetry("syncNodes", func() error { return oc.syncNodesRetriable(nodes) }) -} - -// This function implements the main body of work of what is described by syncNodes. -// Upon failure, it may be invoked multiple times in order to avoid a pod restart. func (oc *Controller) syncNodesRetriable(nodes []interface{}) error { foundNodes := sets.NewString() for _, tmp := range nodes { @@ -1570,8 +1562,13 @@ func (oc *Controller) addUpdateNodeEvent(node *kapi.Node, nSyncs *nodeSyncs) err if err != nil { klog.Errorf("Unable to list existing pods on node: %s, existing pods on this node may not function") } else { - oc.addRetryPods(pods.Items) - oc.requestRetryPods() + klog.V(5).Infof("When adding node %s, found %d pods to add to retryPods", node.Name, len(pods.Items)) + for _, pod := range pods.Items { + pod := pod + klog.V(5).Infof("Adding pod %s/%s to retryPods", pod.Namespace, pod.Name) + oc.retryPods.addRetryObj(&pod) + } + oc.retryPods.requestRetryObjs() } if len(errs) == 0 { @@ -1595,79 +1592,3 @@ func (oc *Controller) deleteNodeEvent(node *kapi.Node) error { oc.nodeClusterRouterPortFailed.Delete(node.Name) return nil } - -// iterateRetryNodes checks if any outstanding Nodes exists -// then tries to re-add them if so -// updateAll forces all nodes to be attempted to be retried regardless -func (oc *Controller) iterateRetryNodes(updateAll bool) { - oc.retryNodes.retryMutex.Lock() - defer oc.retryNodes.retryMutex.Unlock() - now := time.Now() - for nodeName, entry := range oc.retryNodes.entries { - if entry.ignore { - // neither addition nor deletion is being retried - continue - } - // check if we need to create - if entry.newObj != nil { - n := entry.newObj.(*kapi.Node) - kNode, err := oc.watchFactory.GetNode(n.Name) - if err != nil && errors.IsNotFound(err) { - klog.Infof("%s node not found in the informers cache, not going to retry node setup", nodeName) - entry.newObj = nil - } else { - entry.newObj = kNode - } - } - - entry.backoffSec = entry.backoffSec * 2 - if entry.backoffSec > 60 { - entry.backoffSec = 60 - } - backoff := (entry.backoffSec * time.Second) + 
(time.Duration(rand.Intn(500)) * time.Millisecond) - nTimer := entry.timeStamp.Add(backoff) - if updateAll || now.After(nTimer) { - klog.Infof("Node Retry: %s retry node setup", nodeName) - - // check if we need to delete - if entry.oldObj != nil { - klog.Infof("Node Retry: Removing old Node for %s", nodeName) - node := entry.oldObj.(*kapi.Node) - if err := oc.deleteNodeEvent(node); err != nil { - klog.Infof("Node Retry delete failed for %s, will try again later: %v", - nodeName, err) - entry.timeStamp = time.Now() - continue - } - // successfully cleaned up old node, remove it from the retry cache - entry.oldObj = nil - } - - // create new node if needed - if entry.newObj != nil { - klog.Infof("Node Retry: Creating new node for %s", nodeName) - _, nodeSync := oc.addNodeFailed.Load(nodeName) - _, clusterRtrSync := oc.nodeClusterRouterPortFailed.Load(nodeName) - _, mgmtSync := oc.mgmtPortFailed.Load(nodeName) - _, gwSync := oc.gatewaysFailed.Load(nodeName) - if err := oc.addUpdateNodeEvent(entry.newObj.(*kapi.Node), - &nodeSyncs{nodeSync, - clusterRtrSync, - mgmtSync, - gwSync}); err != nil { - klog.Infof("Node Retry create failed for %s, will try again later: %v", - nodeName, err) - entry.timeStamp = time.Now() - continue - } - // successfully create node, remove it from the retry cache - entry.newObj = nil - } - - klog.Infof("Node Retry successful for %s", nodeName) - oc.retryNodes.deleteRetryObj(nodeName, false) - } else { - klog.V(5).Infof("%s retry node not after timer yet, time: %s", nodeName, nTimer) - } - } -} diff --git a/go-controller/pkg/ovn/namespace.go b/go-controller/pkg/ovn/namespace.go index 9dfe9e67b0..d823e5ed26 100644 --- a/go-controller/pkg/ovn/namespace.go +++ b/go-controller/pkg/ovn/namespace.go @@ -419,16 +419,16 @@ func (oc *Controller) deleteNamespace(ns *kapi.Namespace) { klog.V(5).Infof("Deleting Namespace's NetworkPolicy entities") for _, np := range nsInfo.networkPolicies { key := getPolicyNamespacedName(np.policy) - oc.retryNetPolices.skipRetryObj(key) + oc.retryNetworkPolicies.skipRetryObj(key) // add the full np object to the retry entry, since the namespace is going to be removed // along with any mappings of nsInfo -> network policies - oc.retryNetPolices.initRetryObjWithDelete(np.policy, key, np) + oc.retryNetworkPolicies.initRetryObjWithDelete(np.policy, key, np) isLastPolicyInNamespace := len(nsInfo.networkPolicies) == 1 if err := oc.destroyNetworkPolicy(np, isLastPolicyInNamespace); err != nil { klog.Errorf("Failed to delete network policy: %s, error: %v", key, err) - oc.retryNetPolices.unSkipRetryObj(key) + oc.retryNetworkPolicies.unSkipRetryObj(key) } else { - oc.retryNetPolices.deleteRetryObj(key, true) + oc.retryNetworkPolicies.deleteRetryObj(key, true) delete(nsInfo.networkPolicies, np.name) } } @@ -496,7 +496,7 @@ func (oc *Controller) ensureNamespaceLocked(ns string, readOnly bool, namespace oc.namespaces[ns] = nsInfo } else { nsInfoExisted = true - // if we found and existing nsInfo, do not hold the namespaces lock + // if we found an existing nsInfo, do not hold the namespaces lock // while waiting for nsInfo to Lock oc.namespacesMutex.Unlock() } diff --git a/go-controller/pkg/ovn/obj_retry.go b/go-controller/pkg/ovn/obj_retry.go index 4761fe28eb..51568ed5d7 100644 --- a/go-controller/pkg/ovn/obj_retry.go +++ b/go-controller/pkg/ovn/obj_retry.go @@ -1,9 +1,25 @@ package ovn import ( + "fmt" + "math/rand" + "net" + "reflect" + "strings" + "sync" "time" + kapi "k8s.io/api/core/v1" + knet "k8s.io/api/networking/v1" + kerrors 
"k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/labels" + kerrorsutil "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/client-go/tools/cache" + "k8s.io/klog/v2" + + factory "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/factory" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" ) const retryObjInterval = 30 * time.Second @@ -24,6 +40,57 @@ type retryObjEntry struct { ignore bool } +type retryObjs struct { + retryMutex sync.Mutex + // cache to hold object needs retry to successfully complete processing + entries map[string]*retryObjEntry + // resource type for these objects + oType reflect.Type + // channel to indicate we need to retry objs immediately + retryChan chan struct{} + // namespace filter fed to the handler for this resource type + namespaceForFilteredHandler string + // label selector fed to the handler for this resource type + labelSelectorForFilteredHandler labels.Selector + // sync function for the handler + syncFunc func([]interface{}) error + // extra parameters needed by specific types, for now + // in use by network policy dynamic handlers + extraParameters interface{} +} + +// NewRetryObjs returns a new retryObjs instance, packed with the desired input parameters. +// The returned struct is essential for watchResource and the whole retry logic. +func NewRetryObjs( + objectType reflect.Type, + namespaceForFilteredHandler string, + labelSelectorForFilteredHandler labels.Selector, + syncFunc func([]interface{}) error, + extraParameters interface{}) *retryObjs { + + return &retryObjs{ + retryMutex: sync.Mutex{}, + entries: make(map[string]*retryObjEntry), + retryChan: make(chan struct{}, 1), + oType: objectType, + namespaceForFilteredHandler: namespaceForFilteredHandler, + labelSelectorForFilteredHandler: labelSelectorForFilteredHandler, + syncFunc: syncFunc, + extraParameters: extraParameters, + } +} + +// addRetryObj adds an object to be retried later for an add event +func (r *retryObjs) addRetryObj(obj interface{}) { + key, err := getResourceKey(r.oType, obj) + if err != nil { + klog.Errorf("Could not get the key of %v %v: %v", r.oType, obj, err) + return + } + r.initRetryObjWithAdd(obj, key) + r.unSkipRetryObj(key) +} + // initRetryObjWithAdd tracks an object that failed to be created to potentially retry later // initially it is marked as skipped for retry loop (ignore = true) func (r *retryObjs) initRetryObjWithAdd(obj interface{}, key string) { @@ -104,14 +171,22 @@ func (r *retryObjs) skipRetryObj(key string) { } } +// checkRetryObj returns true if an entry with the given key exists, returns false otherwise. +func (r *retryObjs) checkRetryObj(key string) bool { + r.retryMutex.Lock() + defer r.retryMutex.Unlock() + _, ok := r.entries[key] + return ok +} + // requestRetryObjs allows a caller to immediately request to iterate through all objects that // are in the retry cache. 
This will ignore any outstanding time wait/backoff state func (r *retryObjs) requestRetryObjs() { select { case r.retryChan <- struct{}{}: - klog.V(5).Infof("Iterate retry object requested") + klog.V(5).Infof("Iterate retry objects requested (resource %v)", r.oType) default: - klog.V(5).Infof("Iterate retry object already requested") + klog.V(5).Infof("Iterate retry objects already requested (resource %v)", r.oType) } } @@ -126,18 +201,984 @@ func (r *retryObjs) getObjRetryEntry(key string) *retryObjEntry { return nil } -// triggerRetryObjs track the retry objects map and every 30 seconds check if any object need to be retried -func (oc *Controller) triggerRetryObjs(retryChan chan struct{}, iterateObjs func(bool)) { - go func() { - for { - select { - case <-time.After(retryObjInterval): - iterateObjs(false) - case <-retryChan: - iterateObjs(true) - case <-oc.stopChan: +var sep = "/" + +func splitNamespacedName(namespacedName string) (string, string) { + if strings.Contains(namespacedName, sep) { + s := strings.SplitN(namespacedName, sep, 2) + if len(s) == 2 { + return s[0], s[1] + } + } + return namespacedName, "" +} + +func getNamespacedName(namespace, name string) string { + return namespace + sep + name +} + +// hasResourceAnUpdateFunc returns true if the given resource type has a dedicated update function. +// It returns false if, upon an update event on this resource type, we instead need to first delete the old +// object and then add the new one. +func hasResourceAnUpdateFunc(objType reflect.Type) bool { + switch objType { + case factory.PodType, + factory.NodeType, + factory.PeerPodSelectorType, + factory.PeerPodForNamespaceAndPodSelectorType, + factory.LocalPodSelectorType: + return true + } + return false +} + +// areResourcesEqual returns true if, given two objects of a known resource type, the update logic for this resource +// type considers them equal and therefore no update is needed. It returns false when the two objects are not considered +// equal and an update needs be executed. This is regardless of how the update is carried out (whether with a dedicated update +// function or with a delete on the old obj followed by an add on the new obj). 
+func areResourcesEqual(objType reflect.Type, obj1, obj2 interface{}) (bool, error) { + // switch based on type + switch objType { + case factory.PolicyType: + np1, ok := obj1.(*knet.NetworkPolicy) + if !ok { + return false, fmt.Errorf("could not cast obj1 of type interface{} to *knet.NetworkPolicy") + } + np2, ok := obj2.(*knet.NetworkPolicy) + if !ok { + return false, fmt.Errorf("could not cast obj2 of type interface{} to *knet.NetworkPolicy") + } + return reflect.DeepEqual(np1, np2), nil + + case factory.NodeType: + node1, ok := obj1.(*kapi.Node) + if !ok { + return false, fmt.Errorf("could not cast obj1 of type interface{} to *kapi.Node") + } + node2, ok := obj2.(*kapi.Node) + if !ok { + return false, fmt.Errorf("could not cast obj2 of type interface{} to *kapi.Node") + } + + // when shouldUpdate is false, the hostsubnet is not assigned by ovn-kubernetes + shouldUpdate, err := shouldUpdate(node2, node1) + if err != nil { + klog.Errorf(err.Error()) + } + return !shouldUpdate, nil + + case factory.PeerServiceType: + service1, ok := obj1.(*kapi.Service) + if !ok { + return false, fmt.Errorf("could not cast obj1 of type interface{} to *kapi.Service") + } + service2, ok := obj2.(*kapi.Service) + if !ok { + return false, fmt.Errorf("could not cast obj2 of type interface{} to *kapi.Service") + } + areEqual := reflect.DeepEqual(service1.Spec.ExternalIPs, service2.Spec.ExternalIPs) && + reflect.DeepEqual(service1.Spec.ClusterIP, service2.Spec.ClusterIP) && + reflect.DeepEqual(service1.Spec.ClusterIPs, service2.Spec.ClusterIPs) && + reflect.DeepEqual(service1.Spec.Type, service2.Spec.Type) && + reflect.DeepEqual(service1.Status.LoadBalancer.Ingress, service2.Status.LoadBalancer.Ingress) + return areEqual, nil + + case factory.PodType, + factory.PeerPodSelectorType, + factory.PeerPodForNamespaceAndPodSelectorType, + factory.LocalPodSelectorType: + // For these types, there was no old vs new obj comparison in the original update code, + // so pretend they're always different so that the update code gets executed + return false, nil + + case factory.PeerNamespaceSelectorType, + factory.PeerNamespaceAndPodSelectorType: + // For these types there is no update code, so pretend old and new + // objs are always equivalent and stop processing the update event. + return true, nil + } + + return false, fmt.Errorf("no object comparison for type %v", objType) +} + +// Given an object and its type, it returns the key for this object and an error if the key retrieval failed. +// For all namespaced resources, the key will be namespace/name. For resource types without a namespace, +// the key will be the object name itself. 
+func getResourceKey(objType reflect.Type, obj interface{}) (string, error) {
+	switch objType {
+	case factory.PolicyType:
+		np, ok := obj.(*knet.NetworkPolicy)
+		if !ok {
+			return "", fmt.Errorf("could not cast interface{} object to *knet.NetworkPolicy")
+		}
+		return getPolicyNamespacedName(np), nil
+
+	case factory.NodeType:
+		node, ok := obj.(*kapi.Node)
+		if !ok {
+			return "", fmt.Errorf("could not cast interface{} object to *kapi.Node")
+		}
+		return node.Name, nil
+
+	case factory.PeerServiceType:
+		service, ok := obj.(*kapi.Service)
+		if !ok {
+			return "", fmt.Errorf("could not cast interface{} object to *kapi.Service")
+		}
+		return getNamespacedName(service.Namespace, service.Name), nil
+
+	case factory.PodType,
+		factory.PeerPodSelectorType,
+		factory.PeerPodForNamespaceAndPodSelectorType,
+		factory.LocalPodSelectorType:
+		pod, ok := obj.(*kapi.Pod)
+		if !ok {
+			return "", fmt.Errorf("could not cast interface{} object to *kapi.Pod")
+		}
+		return getNamespacedName(pod.Namespace, pod.Name), nil
+
+	case factory.PeerNamespaceAndPodSelectorType,
+		factory.PeerNamespaceSelectorType:
+		namespace, ok := obj.(*kapi.Namespace)
+		if !ok {
+			return "", fmt.Errorf("could not cast interface{} object to *kapi.Namespace")
+		}
+		return namespace.Name, nil
+	}
+
+	return "", fmt.Errorf("object type %v not supported", objType)
+}
+
+func (oc *Controller) getPortInfo(pod *kapi.Pod) *lpInfo {
+	var portInfo *lpInfo
+	key := util.GetLogicalPortName(pod.Namespace, pod.Name)
+	if !util.PodWantsNetwork(pod) {
+		// create dummy logicalPortInfo for host-networked pods
+		mac, _ := net.ParseMAC("00:00:00:00:00:00")
+		portInfo = &lpInfo{
+			logicalSwitch: "host-networked",
+			name:          key,
+			uuid:          "host-networked",
+			ips:           []*net.IPNet{},
+			mac:           mac,
+		}
+	} else {
+		portInfo, _ = oc.logicalPortCache.get(key)
+	}
+	return portInfo
+}
+
+// Given an object and its type, getInternalCacheEntry returns the internal cache entry for this object.
+// This is now used only for pods, which will get their logical port cache entry.
+func (oc *Controller) getInternalCacheEntry(objType reflect.Type, obj interface{}) interface{} {
+	switch objType {
+	case factory.PodType:
+		pod := obj.(*kapi.Pod)
+		return oc.getPortInfo(pod)
+	default:
+		return nil
+	}
+}
+
+// Given an object key and its type, getResourceFromInformerCache returns the latest state of the object
+// from the informers cache.
+func (oc *Controller) getResourceFromInformerCache(objType reflect.Type, key string) (interface{}, error) {
+	var obj interface{}
+	var err error
+
+	switch objType {
+	case factory.PolicyType:
+		namespace, name := splitNamespacedName(key)
+		obj, err = oc.watchFactory.GetNetworkPolicy(namespace, name)
+
+	case factory.NodeType:
+		obj, err = oc.watchFactory.GetNode(key)
+
+	case factory.PeerServiceType:
+		namespace, name := splitNamespacedName(key)
+		obj, err = oc.watchFactory.GetService(namespace, name)
+
+	case factory.PodType,
+		factory.PeerPodSelectorType,
+		factory.PeerPodForNamespaceAndPodSelectorType,
+		factory.LocalPodSelectorType:
+		namespace, name := splitNamespacedName(key)
+		obj, err = oc.watchFactory.GetPod(namespace, name)
+
+	case factory.PeerNamespaceAndPodSelectorType,
+		factory.PeerNamespaceSelectorType:
+		obj, err = oc.watchFactory.GetNamespace(key)
+
+	default:
+		err = fmt.Errorf("object type %v not supported, cannot retrieve it from informers cache",
+			objType)
+	}
+	return obj, err
+}
+
+// Given an object and its type, recordAddEvent records the add event on this object. Only used for pods now.
+func (oc *Controller) recordAddEvent(objType reflect.Type, obj interface{}) { + switch objType { + case factory.PodType: + klog.V(5).Infof("Recording add event on pod") + pod := obj.(*kapi.Pod) + oc.metricsRecorder.AddPod(pod.UID) + } +} + +// Given an object and its type, recordDeleteEvent records the delete event on this object. Only used for pods now. +func (oc *Controller) recordDeleteEvent(objType reflect.Type, obj interface{}) { + switch objType { + case factory.PodType: + klog.V(5).Infof("Recording delete event on pod") + pod := obj.(*kapi.Pod) + oc.metricsRecorder.CleanPod(pod.UID) + } +} + +// Given an object and its type, recordErrorEvent records an error event on this object. Only used for pods now. +func (oc *Controller) recordErrorEvent(objType reflect.Type, obj interface{}, err error) { + switch objType { + case factory.PodType: + klog.V(5).Infof("Recording error event on pod") + pod := obj.(*kapi.Pod) + oc.metricsRecorder.AddPod(pod.UID) + oc.recordPodEvent(err, pod) + } +} + +// Given an object and its type, isResourceScheduled returns true if the object has been scheduled. +// Only applied to pods for now. Returns true for all other types. +func isResourceScheduled(objType reflect.Type, obj interface{}) bool { + switch objType { + case factory.PodType: + pod := obj.(*kapi.Pod) + return util.PodScheduled(pod) + } + return true +} + +// Given a *retryObjs instance, an object to add and a boolean specifying if the function was executed from +// iterateRetryResources, addResource adds the specified object to the cluster according to its type and +// returns the error, if any, yielded during object creation. +func (oc *Controller) addResource(objectsToRetry *retryObjs, obj interface{}, fromRetryLoop bool) error { + var err error + + switch objectsToRetry.oType { + case factory.PodType: + pod, ok := obj.(*kapi.Pod) + if !ok { + return fmt.Errorf("could not cast interface{} object to *kapi.Pod") + } + return oc.ensurePod(nil, pod, true) + + case factory.PolicyType: + np, ok := obj.(*knet.NetworkPolicy) + if !ok { + return fmt.Errorf("could not cast interface{} object to *knet.NetworkPolicy") + } + if err = oc.addNetworkPolicy(np); err != nil { + klog.Infof("Network Policy add failed for %s/%s, will try again later: %v", + np.Namespace, np.Name, err) + return err + } + + case factory.NodeType: + node, ok := obj.(*kapi.Node) + if !ok { + return fmt.Errorf("could not cast interface{} object to *kapi.Node") + } + var nodeParams *nodeSyncs + if fromRetryLoop { + _, nodeSync := oc.addNodeFailed.Load(node.Name) + _, clusterRtrSync := oc.nodeClusterRouterPortFailed.Load(node.Name) + _, mgmtSync := oc.mgmtPortFailed.Load(node.Name) + _, gwSync := oc.gatewaysFailed.Load(node.Name) + nodeParams = &nodeSyncs{ + nodeSync, + clusterRtrSync, + mgmtSync, + gwSync} + } else { + nodeParams = &nodeSyncs{true, true, true, true} + } + + if err = oc.addUpdateNodeEvent(node, nodeParams); err != nil { + klog.Infof("Node add failed for %s, will try again later: %v", + node.Name, err) + return err + } + + case factory.PeerServiceType: + service, ok := obj.(*kapi.Service) + if !ok { + return fmt.Errorf("could not cast peer service of type interface{} to *kapi.Service") + } + extraParameters := objectsToRetry.extraParameters.(*NetworkPolicyExtraParameters) + return oc.handlePeerServiceAdd(extraParameters.gp, service) + + case factory.PeerPodSelectorType: + extraParameters := objectsToRetry.extraParameters.(*NetworkPolicyExtraParameters) + return
oc.handlePeerPodSelectorAddUpdate(extraParameters.gp, obj) + + case factory.PeerNamespaceAndPodSelectorType: + extraParameters := objectsToRetry.extraParameters.(*NetworkPolicyExtraParameters) + namespace := obj.(*kapi.Namespace) + extraParameters.np.RLock() + alreadyDeleted := extraParameters.np.deleted + extraParameters.np.RUnlock() + if alreadyDeleted { + return nil + } + + // start watching pods in this namespace and selected by the label selector in extraParameters.podSelector + retryPeerPods := NewRetryObjs( + factory.PeerPodForNamespaceAndPodSelectorType, + namespace.Name, + extraParameters.podSelector, + nil, + &NetworkPolicyExtraParameters{gp: extraParameters.gp}, + ) + // The AddFilteredPodHandler call might call handlePeerPodSelectorAddUpdate + // on existing pods so we can't be holding the lock at this point + podHandler := oc.WatchResource(retryPeerPods) + + extraParameters.np.Lock() + defer extraParameters.np.Unlock() + if extraParameters.np.deleted { + oc.watchFactory.RemovePodHandler(podHandler) + return nil + } + extraParameters.np.podHandlerList = append(extraParameters.np.podHandlerList, podHandler) + + case factory.PeerPodForNamespaceAndPodSelectorType: + extraParameters := objectsToRetry.extraParameters.(*NetworkPolicyExtraParameters) + return oc.handlePeerPodSelectorAddUpdate(extraParameters.gp, obj) + + case factory.PeerNamespaceSelectorType: + extraParameters := objectsToRetry.extraParameters.(*NetworkPolicyExtraParameters) + namespace := obj.(*kapi.Namespace) + // Update the ACL ... + return oc.handlePeerNamespaceSelectorOnUpdate(extraParameters.np, extraParameters.gp, func() bool { + // ... on condition that the added address set was not already in the 'gress policy + return extraParameters.gp.addNamespaceAddressSet(namespace.Name) + }) + + case factory.LocalPodSelectorType: + extraParameters := objectsToRetry.extraParameters.(*NetworkPolicyExtraParameters) + return oc.handleLocalPodSelectorAddFunc( + extraParameters.policy, + extraParameters.np, + extraParameters.portGroupIngressDenyName, + extraParameters.portGroupEgressDenyName, + obj) + + default: + return fmt.Errorf("no add function for object type %v", objectsToRetry.oType) + } + + return nil +} + +// Given a *retryObjs instance, an old and a new object, updateResource updates the specified object in the cluster +// to its version in newObj according to its type and returns the error, if any, yielded during the object update. 
+func (oc *Controller) updateResource(objectsToRetry *retryObjs, oldObj, newObj interface{}) error { + switch objectsToRetry.oType { + case factory.PodType: + oldPod := oldObj.(*kapi.Pod) + newPod := newObj.(*kapi.Pod) + newKey, err := getResourceKey(objectsToRetry.oType, newObj) + if err != nil { + return err + } + return oc.ensurePod(oldPod, newPod, objectsToRetry.checkRetryObj(newKey)) + + case factory.NodeType: + newNode, ok := newObj.(*kapi.Node) + if !ok { + return fmt.Errorf("could not cast newObj of type interface{} to *kapi.Node") + } + oldNode, ok := oldObj.(*kapi.Node) + if !ok { + return fmt.Errorf("could not cast oldObj of type interface{} to *kapi.Node") + } + // determine what actually changed in this update + _, nodeSync := oc.addNodeFailed.Load(newNode.Name) + _, failed := oc.nodeClusterRouterPortFailed.Load(newNode.Name) + clusterRtrSync := failed || nodeChassisChanged(oldNode, newNode) || nodeSubnetChanged(oldNode, newNode) + _, failed = oc.mgmtPortFailed.Load(newNode.Name) + mgmtSync := failed || macAddressChanged(oldNode, newNode) || nodeSubnetChanged(oldNode, newNode) + _, failed = oc.gatewaysFailed.Load(newNode.Name) + gwSync := (failed || gatewayChanged(oldNode, newNode) || + nodeSubnetChanged(oldNode, newNode) || hostAddressesChanged(oldNode, newNode)) + + return oc.addUpdateNodeEvent(newNode, &nodeSyncs{nodeSync, clusterRtrSync, mgmtSync, gwSync}) + + case factory.PeerPodSelectorType: + extraParameters := objectsToRetry.extraParameters.(*NetworkPolicyExtraParameters) + return oc.handlePeerPodSelectorAddUpdate(extraParameters.gp, newObj) + + case factory.PeerPodForNamespaceAndPodSelectorType: + extraParameters := objectsToRetry.extraParameters.(*NetworkPolicyExtraParameters) + return oc.handlePeerPodSelectorAddUpdate(extraParameters.gp, newObj) + + case factory.LocalPodSelectorType: + extraParameters := objectsToRetry.extraParameters.(*NetworkPolicyExtraParameters) + return oc.handleLocalPodSelectorAddFunc( + extraParameters.policy, + extraParameters.np, + extraParameters.portGroupIngressDenyName, + extraParameters.portGroupEgressDenyName, + newObj) + } + + return fmt.Errorf("no update function for object type %v", objectsToRetry.oType) +} + +// Given a *retryObjs instance, an object and optionally a cachedObj, deleteResource deletes the object from the cluster +// according to the delete logic of its resource type. cachedObj is the internal cache entry for this object, +// used for now for pods and network policies. 
+func (oc *Controller) deleteResource(objectsToRetry *retryObjs, obj, cachedObj interface{}) error { + switch objectsToRetry.oType { + case factory.PodType: + var portInfo *lpInfo + pod := obj.(*kapi.Pod) + if cachedObj != nil { + portInfo = cachedObj.(*lpInfo) + } + oc.logicalPortCache.remove(util.GetLogicalPortName(pod.Namespace, pod.Name)) + return oc.removePod(pod, portInfo) + + case factory.PolicyType: + var cachedNP *networkPolicy + knp, ok := obj.(*knet.NetworkPolicy) + if !ok { + return fmt.Errorf("could not cast obj of type interface{} to *knet.NetworkPolicy") + } + if cachedObj != nil { + if cachedNP, ok = cachedObj.(*networkPolicy); !ok { + cachedNP = nil + } + } + return oc.deleteNetworkPolicy(knp, cachedNP) + + case factory.NodeType: + node, ok := obj.(*kapi.Node) + if !ok { + return fmt.Errorf("could not cast obj of type interface{} to *kapi.Node") + } + return oc.deleteNodeEvent(node) + + case factory.PeerServiceType: + service, ok := obj.(*kapi.Service) + if !ok { + return fmt.Errorf("could not cast peer service of type interface{} to *kapi.Service") + } + extraParameters := objectsToRetry.extraParameters.(*NetworkPolicyExtraParameters) + return oc.handlePeerServiceDelete(extraParameters.gp, service) + + case factory.PeerPodSelectorType: + extraParameters := objectsToRetry.extraParameters.(*NetworkPolicyExtraParameters) + return oc.handlePeerPodSelectorDelete(extraParameters.gp, obj) + + case factory.PeerNamespaceAndPodSelectorType: + extraParameters := objectsToRetry.extraParameters.(*NetworkPolicyExtraParameters) + // when the namespace labels no longer apply + // remove the namespace's pods from the address_set + var errs []error + namespace := obj.(*kapi.Namespace) + pods, _ := oc.watchFactory.GetPods(namespace.Name) + + for _, pod := range pods { + if err := oc.handlePeerPodSelectorDelete(extraParameters.gp, pod); err != nil { + errs = append(errs, err) + } + } + return kerrorsutil.NewAggregate(errs) + + case factory.PeerPodForNamespaceAndPodSelectorType: + extraParameters := objectsToRetry.extraParameters.(*NetworkPolicyExtraParameters) + return oc.handlePeerPodSelectorDelete(extraParameters.gp, obj) + + case factory.PeerNamespaceSelectorType: + extraParameters := objectsToRetry.extraParameters.(*NetworkPolicyExtraParameters) + namespace := obj.(*kapi.Namespace) + // Remove namespace address set from the *gress policy in cache + // (done in gress.delNamespaceAddressSet()), and then update ACLs + return oc.handlePeerNamespaceSelectorOnUpdate(extraParameters.np, extraParameters.gp, func() bool { + // ... on condition that the removed address set was in the 'gress policy + return extraParameters.gp.delNamespaceAddressSet(namespace.Name) + }) + + case factory.LocalPodSelectorType: + extraParameters := objectsToRetry.extraParameters.(*NetworkPolicyExtraParameters) + return oc.handleLocalPodSelectorDelFunc( + extraParameters.policy, + extraParameters.np, + extraParameters.portGroupIngressDenyName, + extraParameters.portGroupEgressDenyName, + obj) + + default: + return fmt.Errorf("object type %v not supported", objectsToRetry.oType) + } +} + +// iterateRetryResources checks if any outstanding resource objects exist and if so it tries to +// re-add them.
updateAll forces a retry of all objects, regardless of their backoff timer. +func (oc *Controller) iterateRetryResources(r *retryObjs, updateAll bool) { + r.retryMutex.Lock() + defer r.retryMutex.Unlock() + now := time.Now() + for objKey, entry := range r.entries { + if entry.ignore { + continue + } + // check if we need to create the resource object + if entry.newObj != nil { + // get the latest version of the resource object from the informer; + // if it doesn't exist we are not going to create the new object. + obj, err := oc.getResourceFromInformerCache(r.oType, objKey) + if err != nil { + if kerrors.IsNotFound(err) { + klog.Infof("%v %s not found in the informers cache,"+ + " not going to retry object create", r.oType, objKey) + entry.newObj = nil + } else { + klog.Errorf("Failed to look up %v %s in the informers cache,"+ + " will retry later: %v", r.oType, objKey, err) + continue + } + } else { + entry.newObj = obj + } + } + + entry.backoffSec = entry.backoffSec * 2 + if entry.backoffSec > 60 { + entry.backoffSec = 60 + } + backoff := (entry.backoffSec * time.Second) + (time.Duration(rand.Intn(500)) * time.Millisecond) + objTimer := entry.timeStamp.Add(backoff) + if !updateAll && now.Before(objTimer) { + klog.V(5).Infof("%v retry %s not after timer yet, time: %s", r.oType, objKey, objTimer) + continue + } + + klog.Infof("%v %s: retry object setup", r.oType, objKey) + + // delete old object if needed + if entry.oldObj != nil { + klog.Infof("%v retry: removing old object for %s", r.oType, objKey) + if !isResourceScheduled(r.oType, entry.oldObj) { + klog.V(5).Infof("Retry: %s %s not scheduled", r.oType, objKey) + continue + } + if err := oc.deleteResource(r, entry.oldObj, entry.config); err != nil { + klog.Infof("%v retry delete failed for %s, will try again later: %v", r.oType, objKey, err) + entry.timeStamp = time.Now() + continue + } + // successfully cleaned up old object, remove it from the retry cache + entry.oldObj = nil + } + + // create new object if needed + if entry.newObj != nil { + klog.Infof("%v retry: creating object for %s", r.oType, objKey) + if !isResourceScheduled(r.oType, entry.newObj) { + klog.V(5).Infof("Retry: %s %s not scheduled", r.oType, objKey) + continue + } + if err := oc.addResource(r, entry.newObj, true); err != nil { + klog.Infof("%v retry create failed for %s, will try again later: %v", r.oType, objKey, err) + entry.timeStamp = time.Now() + continue + } + // successfully created the new object, remove it from the retry cache + entry.newObj = nil + } + + klog.Infof("%v retry successful for %s", r.oType, objKey) + r.deleteRetryObj(objKey, false) + + } +} + +// periodicallyRetryResources tracks retryObjs and checks if any object needs to be retried for add or delete every +// retryObjInterval seconds or when requested through retryChan. +func (oc *Controller) periodicallyRetryResources(r *retryObjs) { + for { + select { + case <-time.After(retryObjInterval): + klog.V(5).Infof("%v have elapsed, retrying failed objects of type %v", retryObjInterval, r.oType) + oc.iterateRetryResources(r, false) + + case <-r.retryChan: + klog.V(5).Infof("Retry channel got triggered: retrying failed objects of type %v", r.oType) + oc.iterateRetryResources(r, true) + + case <-oc.stopChan: + klog.V(5).Infof("Stop channel got triggered: will stop retrying failed objects of type %v", r.oType) + return + } + } +} + +// Given a *retryObjs instance, getSyncResourcesFunc returns the sync function for a given resource type.
+// This will then be called on all existing objects when a watcher is started. +func (oc *Controller) getSyncResourcesFunc(r *retryObjs) (func([]interface{}), error) { + + var syncRetriableFunc func([]interface{}) error + var syncFunc func([]interface{}) + var name string + + // If a type needs a retriable sync function, it will set syncRetriableFunc + // and will keep syncFunc=nil. For a non-retriable sync func, + // it will directly set syncFunc. + switch r.oType { + case factory.PodType: + name = "SyncPods" + syncRetriableFunc = oc.syncPodsRetriable + + case factory.PolicyType: + name = "syncNetworkPolicies" + syncRetriableFunc = oc.syncNetworkPolicies + + case factory.NodeType: + name = "syncNodes" + syncRetriableFunc = oc.syncNodesRetriable + + case factory.PeerServiceType, + factory.PeerNamespaceAndPodSelectorType: + name = "" + syncRetriableFunc = nil + + case factory.PeerPodSelectorType: + name = "PeerPodSelector" + extraParameters := r.extraParameters.(*NetworkPolicyExtraParameters) + syncRetriableFunc = func(objs []interface{}) error { + return oc.handlePeerPodSelectorAddUpdate(extraParameters.gp, objs...) + } + + case factory.PeerPodForNamespaceAndPodSelectorType: + name = "PeerPodForNamespaceAndPodSelector" + extraParameters := r.extraParameters.(*NetworkPolicyExtraParameters) + syncRetriableFunc = func(objs []interface{}) error { + return oc.handlePeerPodSelectorAddUpdate(extraParameters.gp, objs...) + } + + case factory.PeerNamespaceSelectorType: + name = "PeerNamespaceSelector" + extraParameters := r.extraParameters.(*NetworkPolicyExtraParameters) + // the function below will never fail, so there's no point in making it retriable... + syncFunc = func(i []interface{}) { + // This needs to be a write lock because there's no locking around 'gress policies + extraParameters.np.Lock() + defer extraParameters.np.Unlock() + // We load the existing address set into the 'gress policy. + // Notice that this will make the AddFunc for this initial + // address set a noop. + // The ACL must be set explicitly after setting up this handler + // for the address set to be considered. + extraParameters.gp.addNamespaceAddressSets(i) + } + + case factory.LocalPodSelectorType: + name = "LocalPodSelectorType" + syncRetriableFunc = r.syncFunc + + default: + return nil, fmt.Errorf("no sync function for object type %v", r.oType) + } + + if syncFunc == nil { + syncFunc = func(objects []interface{}) { + if syncRetriableFunc == nil { return } + oc.syncWithRetry(name, func() error { return syncRetriableFunc(objects) }) } - }() + } + return syncFunc, nil +} + +// Given an object and its type, isObjectInTerminalState returns true if the object is in a terminal state. +// This is now used for pods that are either in a PodSucceeded or in a PodFailed state. +func (oc *Controller) isObjectInTerminalState(objType reflect.Type, obj interface{}) bool { + switch objType { + case factory.PodType, + factory.PeerPodSelectorType, + factory.PeerPodForNamespaceAndPodSelectorType, + factory.LocalPodSelectorType: + pod := obj.(*kapi.Pod) + return util.PodCompleted(pod) + + default: + return false + } +} + +type resourceEvent string + +var ( + resourceEventAdd resourceEvent = "add" + resourceEventUpdate resourceEvent = "update" +) + +// processObjectInTerminalState is executed when an object has been added or updated and is actually in a terminal state +// already. The add or update event is not valid for such an object, which we now remove from the cluster in order to +// free its resources.
(for now, this applies to completed pods) +func (oc *Controller) processObjectInTerminalState(objectsToRetry *retryObjs, obj interface{}, key string, event resourceEvent) { + // The object is in a terminal state: delete it from the cluster, delete its retry entry and return. + klog.Infof("Detected object %s of type %v in terminal state (e.g. completed)"+ + " during %s event: will remove it", key, objectsToRetry.oType, event) + + internalCacheEntry := oc.getInternalCacheEntry(objectsToRetry.oType, obj) + objectsToRetry.initRetryObjWithDelete(obj, key, internalCacheEntry) // set up the retry obj for deletion + if retryEntry := objectsToRetry.getObjRetryEntry(key); retryEntry != nil { + // retryEntry shouldn't be nil since we've just added the obj to objectsToRetry + retryEntry.newObj = nil // will not be retried for addition + } + + if err := oc.deleteResource(objectsToRetry, obj, internalCacheEntry); err != nil { + klog.Errorf("Failed to delete object %s of type %v in terminal state, during %s event: %v", + key, objectsToRetry.oType, event, err) + oc.recordErrorEvent(objectsToRetry.oType, obj, err) + objectsToRetry.unSkipRetryObj(key) + return + } + objectsToRetry.removeDeleteFromRetryObj(key) + objectsToRetry.deleteRetryObj(key, true) +} + +// WatchResource starts the watching of a resource type, manages its retry entries and calls +// back the appropriate handler logic. It also starts a goroutine that goes over all retry objects +// periodically or when explicitly requested. +// Note: when applying WatchResource to a new resource type, the appropriate resource-specific logic must be added to +// the different methods it calls. +func (oc *Controller) WatchResource(objectsToRetry *retryObjs) *factory.Handler { + // track the retry entries and every 30 seconds (or upon explicit request) check if any objects + // need to be retried + go oc.periodicallyRetryResources(objectsToRetry) + + addHandlerFunc, err := oc.watchFactory.GetResourceHandlerFunc(objectsToRetry.oType) + if err != nil { + klog.Errorf("No resource handler function found for resource %v. "+ + "Cannot watch this resource.", objectsToRetry.oType) + return nil + } + syncFunc, err := oc.getSyncResourcesFunc(objectsToRetry) + if err != nil { + klog.Errorf("No sync function found for resource %v. "+ + "Cannot watch this resource.", objectsToRetry.oType) + return nil + } + + // create the actual watcher + handler := addHandlerFunc( + objectsToRetry.namespaceForFilteredHandler, // filter out objects not in this namespace + objectsToRetry.labelSelectorForFilteredHandler, // filter out objects not matching these labels + cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) { + oc.recordAddEvent(objectsToRetry.oType, obj) + + key, err := getResourceKey(objectsToRetry.oType, obj) + if err != nil { + klog.Errorf("Upon add event: %v", err) + return + } + klog.Infof("Add event received for resource %v, key=%s", objectsToRetry.oType, key) + + objectsToRetry.initRetryObjWithAdd(obj, key) + objectsToRetry.skipRetryObj(key) + + // This only applies to pod watchers (pods + dynamic network policy handlers watching pods): + // if ovnkube-master is restarted, it will get all the add events, including those for completed pods + if oc.isObjectInTerminalState(objectsToRetry.oType, obj) { + oc.processObjectInTerminalState(objectsToRetry, obj, key, resourceEventAdd) + return + } + + // If there is a delete entry with the same key, we got an add event for an object + // with the same name as a previous object that failed deletion.
+ // Destroy the old object before we add the new one. + if retryEntry := objectsToRetry.getObjRetryEntry(key); retryEntry != nil && retryEntry.oldObj != nil { + klog.Infof("Detected stale object during new object"+ + " add of type %v with the same key: %s", + objectsToRetry.oType, key) + internalCacheEntry := oc.getInternalCacheEntry(objectsToRetry.oType, obj) + if err := oc.deleteResource(objectsToRetry, obj, internalCacheEntry); err != nil { + klog.Errorf("Failed to delete old object %s of type %v,"+ + " during add event: %v", key, objectsToRetry.oType, err) + oc.recordErrorEvent(objectsToRetry.oType, obj, err) + objectsToRetry.unSkipRetryObj(key) + return + } + objectsToRetry.removeDeleteFromRetryObj(key) + } + start := time.Now() + if err := oc.addResource(objectsToRetry, obj, false); err != nil { + klog.Errorf("Failed to create %v object %s, error: %v", + objectsToRetry.oType, key, err) + oc.recordErrorEvent(objectsToRetry.oType, obj, err) + objectsToRetry.unSkipRetryObj(key) + return + } + klog.Infof("Creating %v %s took: %v", objectsToRetry.oType, key, time.Since(start)) + objectsToRetry.deleteRetryObj(key, true) + }, + + UpdateFunc: func(old, newer interface{}) { + // skip the whole update if old and newer are equal + areEqual, err := areResourcesEqual(objectsToRetry.oType, old, newer) + if err != nil { + klog.Errorf("Could not compare old and newer resource objects of type %v: %v", + objectsToRetry.oType, err) + return + } + klog.V(5).Infof("Update event received for resource %s, old object is equal to new: %v", + objectsToRetry.oType, areEqual) + if areEqual { + return + } + + // get the object keys for newer and old (expected to be the same) + newKey, err := getResourceKey(objectsToRetry.oType, newer) + if err != nil { + klog.Errorf("Update of resource %v failed when looking up key of new obj: %v", + objectsToRetry.oType, err) + return + } + oldKey, err := getResourceKey(objectsToRetry.oType, old) + if err != nil { + klog.Errorf("Update of resource %v failed when looking up key of old obj: %v", + objectsToRetry.oType, err) + return + } + + // skip the whole update if the new object doesn't exist anymore in the API server + newer, err = oc.getResourceFromInformerCache(objectsToRetry.oType, newKey) + if err != nil { + klog.Warningf("Unable to get %v %s from informer cache (perhaps it was already"+ + " deleted?), skipping update: %v", objectsToRetry.oType, newKey, err) + return + } + + klog.Infof("Update event received for resource %v, oldKey=%s, newKey=%s", + objectsToRetry.oType, oldKey, newKey) + + objectsToRetry.skipRetryObj(newKey) + hasUpdateFunc := hasResourceAnUpdateFunc(objectsToRetry.oType) + + // STEP 1: + // Delete existing (old) object if: + // a) it has a retry entry marked for deletion or + // b) the resource is in terminal state (e.g. 
pod is completed) or + // c) this resource type has no update function, so an update means delete old obj and add new one + retryEntry := objectsToRetry.getObjRetryEntry(oldKey) + if retryEntry != nil && retryEntry.oldObj != nil { + // [step 1a] there is a retry entry marked for deletion + klog.Infof("Found old retry object for %v %s: will delete it", + objectsToRetry.oType, oldKey) + if err := oc.deleteResource(objectsToRetry, retryEntry.oldObj, + retryEntry.config); err != nil { + klog.Errorf("Failed to delete stale object %s, during update: %v", oldKey, err) + oc.recordErrorEvent(objectsToRetry.oType, retryEntry.oldObj, err) + objectsToRetry.initRetryObjWithAdd(newer, newKey) + objectsToRetry.unSkipRetryObj(oldKey) + return + } + // remove the old object from retry entry since it was correctly deleted + objectsToRetry.removeDeleteFromRetryObj(oldKey) + + } else if oc.isObjectInTerminalState(objectsToRetry.oType, newer) { // check the latest status on newer + // [step 1b] The object is in a terminal state: delete it from the cluster, + // delete its retry entry and return. This only applies to pod watchers + // (pods + dynamic network policy handlers watching pods). + oc.processObjectInTerminalState(objectsToRetry, old, oldKey, resourceEventUpdate) + return + + } else if !hasUpdateFunc { + // [step 1c] if this resource type has no update function, + // delete old obj and in step 2 add the new one + var existingCacheEntry interface{} + if retryEntry != nil { + existingCacheEntry = retryEntry.config + } + klog.Infof("Deleting old %s of type %s during update", oldKey, objectsToRetry.oType) + if err := oc.deleteResource(objectsToRetry, old, existingCacheEntry); err != nil { + klog.Errorf("Failed to delete %s %s, during update: %v", + objectsToRetry.oType, oldKey, err) + oc.recordErrorEvent(objectsToRetry.oType, old, err) + objectsToRetry.initRetryObjWithDelete(old, oldKey, nil) + objectsToRetry.initRetryObjWithAdd(newer, newKey) + objectsToRetry.unSkipRetryObj(oldKey) + return + } + // remove the old object from retry entry since it was correctly deleted + objectsToRetry.removeDeleteFromRetryObj(oldKey) + } + + // STEP 2: + // Execute the update function for this resource type; resort to add if no update + // function is available. 
+ if hasUpdateFunc { + klog.Infof("Updating %s %s", objectsToRetry.oType, newKey) + // if this resource type has an update func, just call the update function + if err := oc.updateResource(objectsToRetry, old, newer); err != nil { + klog.Errorf("Failed to update resource %v, old=%s, new=%s, error: %v", + objectsToRetry.oType, oldKey, newKey, err) + oc.recordErrorEvent(objectsToRetry.oType, newer, err) + objectsToRetry.initRetryObjWithAdd(newer, newKey) + objectsToRetry.unSkipRetryObj(newKey) + return + } + } else { // we previously deleted old object, now let's add the new one + klog.Infof("Adding new %s of type %s", newKey, objectsToRetry.oType) + if err := oc.addResource(objectsToRetry, newer, false); err != nil { + oc.recordErrorEvent(objectsToRetry.oType, newer, err) + objectsToRetry.initRetryObjWithAdd(newer, newKey) + objectsToRetry.unSkipRetryObj(newKey) + klog.Errorf("Failed to add %s %s, during update: %v", + objectsToRetry.oType, newKey, err) + return + } + } + + objectsToRetry.deleteRetryObj(newKey, true) + + }, + DeleteFunc: func(obj interface{}) { + oc.recordDeleteEvent(objectsToRetry.oType, obj) + key, err := getResourceKey(objectsToRetry.oType, obj) + if err != nil { + klog.Errorf("Delete of resource %v failed: %v", objectsToRetry.oType, err) + return + } + klog.Infof("Delete event received for resource %v %s", objectsToRetry.oType, key) + objectsToRetry.skipRetryObj(key) + internalCacheEntry := oc.getInternalCacheEntry(objectsToRetry.oType, obj) + objectsToRetry.initRetryObjWithDelete(obj, key, internalCacheEntry) // set up the retry obj for deletion + if err := oc.deleteResource(objectsToRetry, obj, internalCacheEntry); err != nil { + objectsToRetry.unSkipRetryObj(key) + klog.Errorf("Failed to delete resource object %s of type %v, error: %v", + key, objectsToRetry.oType, err) + return + } + objectsToRetry.deleteRetryObj(key, true) + }, + }, + syncFunc) // adds all existing objects at startup + + return handler } diff --git a/go-controller/pkg/ovn/ovn.go b/go-controller/pkg/ovn/ovn.go index 8d2c14aa7b..e95bc9b1c3 100644 --- a/go-controller/pkg/ovn/ovn.go +++ b/go-controller/pkg/ovn/ovn.go @@ -34,7 +34,6 @@ import ( utilnet "k8s.io/utils/net" kapi "k8s.io/api/core/v1" - kapisnetworking "k8s.io/api/networking/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/selection" @@ -141,11 +140,11 @@ type Controller struct { addressSetFactory addressset.AddressSetFactory // For each logical port, the number of network policies that want - // to add a ingress deny rule. + // to add an ingress deny rule. lspIngressDenyCache map[string]int // For each logical port, the number of network policies that want - // to add a egress deny rule. + // to add an egress deny rule. 
lspEgressDenyCache map[string]int // A mutex for lspIngressDenyCache and lspEgressDenyCache @@ -189,20 +188,15 @@ type Controller struct { // v6HostSubnetsUsed keeps track of number of v6 subnets currently assigned to nodes v6HostSubnetsUsed float64 - // Map of pods that need to be retried, and the timestamp of when they last failed - // The key is a string which holds "namespace_podName" - retryPods map[string]*retryEntry - retryPodsLock sync.Mutex - - // channel to indicate we need to retry pods immediately - retryPodsChan chan struct{} + // Objects for pods that need to be retried + retryPods *retryObjs // Objects for network policies that need to be retried - retryNetPolices *retryObjs + retryNetworkPolicies *retryObjs - // Objects for node that need to be retried + // Objects for nodes that need to be retried retryNodes *retryObjs - // Node's specific syncMap used by WatchNode event handler + // Node-specific syncMap used by node event handler gatewaysFailed sync.Map mgmtPortFailed sync.Map addNodeFailed sync.Map @@ -211,27 +205,6 @@ type Controller struct { metricsRecorder *metrics.ControlPlaneRecorder } -type retryEntry struct { - pod *kapi.Pod - timeStamp time.Time - backoffSec time.Duration - // whether to include this pod in retry iterations - ignore bool - // used to indicate if add events need to be retried - needsAdd bool - // used to indicate if delete event needs to be retried; - // this will hold a copy of its value from the oc.logicalSwitchPort cache - needsDel *lpInfo -} - -type retryObjs struct { - retryMutex sync.Mutex - // cache to hold object needs retry to successfully complete processing - entries map[string]*retryObjEntry - // channel to indicate we need to retry objs immediately - retryChan chan struct{} -} - const ( // TCP is the constant string for the string "TCP" TCP = "TCP" @@ -257,6 +230,11 @@ func GetIPFullMask(ip string) string { return IPv4FullMask } +// getPodNamespacedName returns _ for the provided pod +func getPodNamespacedName(pod *kapi.Pod) string { + return util.GetLogicalPortName(pod.Namespace, pod.Name) +} + // NewOvnController creates a new OVN controller for creating logical network // infrastructure and policy func NewOvnController(ovnClient *util.OVNClientset, wf *factory.WatchFactory, stopChan <-chan struct{}, addressSetFactory addressset.AddressSetFactory, @@ -304,25 +282,15 @@ func NewOvnController(ovnClient *util.OVNClientset, wf *factory.WatchFactory, st loadBalancerGroupUUID: "", aclLoggingEnabled: true, joinSwIPManager: nil, - retryPods: make(map[string]*retryEntry), - retryPodsChan: make(chan struct{}, 1), - retryNetPolices: &retryObjs{ - retryMutex: sync.Mutex{}, - entries: make(map[string]*retryObjEntry), - retryChan: make(chan struct{}, 1), - }, - retryNodes: &retryObjs{ - retryMutex: sync.Mutex{}, - entries: make(map[string]*retryObjEntry), - retryChan: make(chan struct{}, 1), - }, - recorder: recorder, - nbClient: libovsdbOvnNBClient, - sbClient: libovsdbOvnSBClient, - svcController: svcController, - svcFactory: svcFactory, - modelClient: modelClient, - metricsRecorder: metrics.NewControlPlaneRecorder(), + retryPods: NewRetryObjs(factory.PodType, "", nil, nil, nil), + retryNetworkPolicies: NewRetryObjs(factory.PolicyType, "", nil, nil, nil), + retryNodes: NewRetryObjs(factory.NodeType, "", nil, nil, nil), + recorder: recorder, + nbClient: libovsdbOvnNBClient, + sbClient: libovsdbOvnSBClient, + svcController: svcController, + svcFactory: svcFactory, + metricsRecorder: metrics.NewControlPlaneRecorder(), } } @@ -526,244 +494,15 @@ 
func (oc *Controller) removePod(pod *kapi.Pod, portInfo *lpInfo) error { return nil } -// WatchPods starts the watching of Pod resource and calls back the appropriate handler logic +// WatchPods starts the watching of the Pod resource and calls back the appropriate handler logic func (oc *Controller) WatchPods() { - oc.triggerRetryObjs(oc.retryPodsChan, oc.iterateRetryPods) - - start := time.Now() - - oc.watchFactory.AddPodHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: func(obj interface{}) { - pod := obj.(*kapi.Pod) - oc.metricsRecorder.AddPod(pod.UID) - oc.checkAndSkipRetryPod(pod) - // in case ovnkube-master is restarted and gets all the add events with completed pods - if util.PodCompleted(pod) { - // pod is in completed state, remove it - klog.Infof("Detected completed pod: %s. Will remove.", getPodNamespacedName(pod)) - oc.initRetryDelPod(pod) - oc.removeAddRetry(pod) - oc.logicalPortCache.remove(util.GetLogicalPortName(pod.Namespace, pod.Name)) - retryEntry := oc.getPodRetryEntry(pod) - var portInfo *lpInfo - if retryEntry != nil { - // retryEntry shouldn't be nil since we usually add the pod to retryCache above - portInfo = retryEntry.needsDel - } - if err := oc.removePod(pod, portInfo); err != nil { - oc.recordPodEvent(err, pod) - klog.Errorf("Failed to delete completed pod %s, error: %v", - getPodNamespacedName(pod), err) - oc.unSkipRetryPod(pod) - return - } - oc.checkAndDeleteRetryPod(pod) - return - } - // need to add new pod - oc.initRetryAddPod(pod) - if retryEntry := oc.getPodRetryEntry(pod); retryEntry != nil && retryEntry.needsDel != nil { - klog.Infof("Detected leftover old pod during new pod add with the same name: %s. "+ - "Attempting deletion of leftover old...", getPodNamespacedName(pod)) - if err := oc.removePod(pod, retryEntry.needsDel); err != nil { - oc.recordPodEvent(err, pod) - klog.Errorf("Failed to delete pod %s, error: %v", - getPodNamespacedName(pod), err) - oc.unSkipRetryPod(pod) - return - } - // deletion was a success; remove delete retry entry - oc.removeDeleteRetry(pod) - } - if err := oc.ensurePod(nil, pod, true); err != nil { - oc.recordPodEvent(err, pod) - klog.Errorf("Failed to add pod %s, error: %v", - getPodNamespacedName(pod), err) - oc.unSkipRetryPod(pod) - return - } - oc.checkAndDeleteRetryPod(pod) - }, - UpdateFunc: func(old, newer interface{}) { - oldPod := old.(*kapi.Pod) - pod := newer.(*kapi.Pod) - // there may be a situation where this update event is not the latest - // and we rely on annotations to determine the pod mac/ifaddr - // this would create a situation where - // 1. addLogicalPort is executing with an older pod annotation, skips setting a new annotation - // 2. creates OVN logical port with old pod annotation value - // 3. CNI flows check fails and pod annotation does not match what is in OVN - // Therefore we need to get the latest version of this pod to attempt to addLogicalPort with - podName := pod.Name - podNs := pod.Namespace - pod, err := oc.watchFactory.GetPod(podNs, podName) - if err != nil { - klog.Warningf("Unable to get pod %s/%s for pod update, most likely it was already deleted", - podNs, podName) - return - } - oc.checkAndSkipRetryPod(pod) - if retryEntry := oc.getPodRetryEntry(pod); retryEntry != nil && retryEntry.needsDel != nil { - klog.Infof("Detected leftover old pod during new pod add with the same name: %s. 
"+ - "Attempting deletion of leftover old...", getPodNamespacedName(pod)) - if err := oc.removePod(pod, retryEntry.needsDel); err != nil { - oc.recordPodEvent(err, pod) - klog.Errorf("Failed to delete pod %s, error: %v", - getPodNamespacedName(pod), err) - oc.unSkipRetryPod(pod) - return - } - // deletion was a success; remove delete retry entry - oc.removeDeleteRetry(pod) - } else if util.PodCompleted(pod) { - // pod is in completed state, remove it - klog.Infof("Detected completed pod: %s. Will remove.", getPodNamespacedName(pod)) - oc.initRetryDelPod(pod) - oc.removeAddRetry(pod) - oc.logicalPortCache.remove(util.GetLogicalPortName(pod.Namespace, pod.Name)) - retryEntry := oc.getPodRetryEntry(pod) - var portInfo *lpInfo - if retryEntry != nil { - // retryEntry shouldn't be nil since we usually add the pod to retryCache above - portInfo = retryEntry.needsDel - } - if err := oc.removePod(pod, portInfo); err != nil { - oc.recordPodEvent(err, pod) - klog.Errorf("Failed to delete completed pod %s, error: %v", - getPodNamespacedName(pod), err) - oc.unSkipRetryPod(pod) - return - } - oc.checkAndDeleteRetryPod(pod) - return - } - - if err := oc.ensurePod(oldPod, pod, oc.checkAndSkipRetryPod(pod)); err != nil { - oc.recordPodEvent(err, pod) - klog.Errorf("Failed to update pod %s, error: %v", - getPodNamespacedName(pod), err) - oc.initRetryAddPod(pod) - // unskip failed pod for next retry iteration - oc.unSkipRetryPod(pod) - return - } - oc.checkAndDeleteRetryPod(pod) - }, - DeleteFunc: func(obj interface{}) { - pod := obj.(*kapi.Pod) - oc.metricsRecorder.CleanPod(pod.UID) - oc.initRetryDelPod(pod) - // we have a copy of portInfo in the retry cache now, we can remove it from - // logicalPortCache so that we don't race with a new add pod that comes with - // the same name. - oc.logicalPortCache.remove(util.GetLogicalPortName(pod.Namespace, pod.Name)) - retryEntry := oc.getPodRetryEntry(pod) - var portInfo *lpInfo - if retryEntry != nil { - // retryEntry shouldn't be nil since we usually add the pod to retryCache above - portInfo = retryEntry.needsDel - } - if err := oc.removePod(pod, portInfo); err != nil { - oc.recordPodEvent(err, pod) - klog.Errorf("Failed to delete pod %s, error: %v", - getPodNamespacedName(pod), err) - oc.unSkipRetryPod(pod) - return - } - oc.checkAndDeleteRetryPod(pod) - }, - }, oc.syncPods) - klog.Infof("Bootstrapping existing pods and cleaning stale pods took %v", time.Since(start)) + oc.WatchResource(oc.retryPods) } -// WatchNetworkPolicy starts the watching of network policy resource and calls +// WatchNetworkPolicy starts the watching of the network policy resource and calls // back the appropriate handler logic func (oc *Controller) WatchNetworkPolicy() { - oc.triggerRetryObjs(oc.retryNetPolices.retryChan, oc.iterateRetryNetworkPolicies) - start := time.Now() - oc.watchFactory.AddPolicyHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: func(obj interface{}) { - policy := obj.(*kapisnetworking.NetworkPolicy) - key := getPolicyNamespacedName(policy) - oc.retryNetPolices.initRetryObjWithAdd(policy, key) - oc.retryNetPolices.skipRetryObj(key) - // If there is a delete entry this is a network policy being added - // with the same name as a previous network policy that failed deletion. - // Destroy it first before we add the new policy. 
- if retryEntry := oc.retryNetPolices.getObjRetryEntry(key); retryEntry != nil && retryEntry.oldObj != nil { - klog.Infof("Detected stale policy during new policy add with the same name: %s/%s", - policy.Namespace, policy.Name) - knp := retryEntry.oldObj.(*kapisnetworking.NetworkPolicy) - if err := oc.deleteNetworkPolicy(knp, nil); err != nil { - oc.retryNetPolices.unSkipRetryObj(key) - klog.Errorf("Failed to delete stale network policy %s, during add: %v", - key, err) - return - } - oc.retryNetPolices.removeDeleteFromRetryObj(key) - } - start := time.Now() - if err := oc.addNetworkPolicy(policy); err != nil { - klog.Errorf("Failed to create network policy %s, error: %v", - getPolicyNamespacedName(policy), err) - oc.retryNetPolices.unSkipRetryObj(key) - return - } - klog.Infof("Created Network Policy: %s took: %v", key, time.Since(start)) - oc.retryNetPolices.deleteRetryObj(key, true) - }, - UpdateFunc: func(old, newer interface{}) { - oldPolicy := old.(*kapisnetworking.NetworkPolicy) - newPolicy := newer.(*kapisnetworking.NetworkPolicy) - if !reflect.DeepEqual(oldPolicy, newPolicy) { - newKey := getPolicyNamespacedName(newPolicy) - oldKey := getPolicyNamespacedName(oldPolicy) - oc.retryNetPolices.skipRetryObj(oldKey) - // check if there was already a retry entry with an old policy - // else just look to delete the old policy in the update - if retryEntry := oc.retryNetPolices.getObjRetryEntry(oldKey); retryEntry != nil && retryEntry.oldObj != nil { - knp := retryEntry.oldObj.(*kapisnetworking.NetworkPolicy) - if err := oc.deleteNetworkPolicy(knp, nil); err != nil { - oc.retryNetPolices.initRetryObjWithAdd(newPolicy, newKey) - oc.retryNetPolices.unSkipRetryObj(oldKey) - klog.Errorf("Failed to delete stale network policy %s, during update: %v", - oldKey, err) - return - } - } else if err := oc.deleteNetworkPolicy(oldPolicy, nil); err != nil { - oc.retryNetPolices.initRetryObjWithDelete(oldPolicy, oldKey, nil) - oc.retryNetPolices.initRetryObjWithAdd(newPolicy, newKey) - oc.retryNetPolices.unSkipRetryObj(oldKey) - klog.Errorf("Failed to delete network policy %s, during update: %v", - oldKey, err) - return - } - // remove the old policy from retry entry since it was correctly deleted - oc.retryNetPolices.removeDeleteFromRetryObj(oldKey) - if err := oc.addNetworkPolicy(newPolicy); err != nil { - oc.retryNetPolices.initRetryObjWithAdd(newPolicy, newKey) - oc.retryNetPolices.unSkipRetryObj(newKey) - klog.Errorf("Failed to create network policy %s, during update: %v", - newKey, err) - return - } - oc.retryNetPolices.deleteRetryObj(newKey, true) - } - }, - DeleteFunc: func(obj interface{}) { - policy := obj.(*kapisnetworking.NetworkPolicy) - key := getPolicyNamespacedName(policy) - oc.retryNetPolices.skipRetryObj(key) - oc.retryNetPolices.initRetryObjWithDelete(policy, key, nil) - if err := oc.deleteNetworkPolicy(policy, nil); err != nil { - oc.retryNetPolices.unSkipRetryObj(key) - klog.Errorf("Failed to delete network policy %s, error: %v", getPolicyNamespacedName(policy), err) - return - } - oc.retryNetPolices.deleteRetryObj(key, true) - }, - }, oc.syncNetworkPolicies) - klog.Infof("Bootstrapping existing policies and cleaning stale policies took %v", time.Since(start)) + oc.WatchResource(oc.retryNetworkPolicies) } // WatchEgressFirewall starts the watching of egressfirewall resource and calls @@ -1093,92 +832,7 @@ func (oc *Controller) syncNodeGateway(node *kapi.Node, hostSubnets []*net.IPNet) // WatchNodes starts the watching of node resource and calls // back the appropriate handler logic 
func (oc *Controller) WatchNodes() { - oc.triggerRetryObjs(oc.retryNodes.retryChan, oc.iterateRetryNodes) - start := time.Now() - oc.watchFactory.AddNodeHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: func(obj interface{}) { - node := obj.(*kapi.Node) - oc.retryNodes.initRetryObjWithAdd(node, node.Name) - oc.retryNodes.skipRetryObj(node.Name) - if retryEntry := oc.retryNodes.getObjRetryEntry(node.Name); retryEntry != nil && retryEntry.oldObj != nil { - klog.Infof("Detected leftover old node during new node add %s.", node.Name) - if err := oc.deleteNodeEvent(node); err != nil { - oc.retryNodes.unSkipRetryObj(node.Name) - klog.Errorf("Failed to delete node %s, error: %v", - node.Name, err) - return - } - oc.retryNodes.removeDeleteFromRetryObj(node.Name) - } - start := time.Now() - if err := oc.addUpdateNodeEvent(node, - &nodeSyncs{true, true, true, true}); err != nil { - klog.Errorf("Failed to create node %s, error: %v", - node.Name, err) - oc.retryNodes.unSkipRetryObj(node.Name) - return - } - klog.Infof("Created Node: %s took: %v", node.Name, time.Since(start)) - oc.retryNodes.deleteRetryObj(node.Name, true) - }, - UpdateFunc: func(old, new interface{}) { - oldNode := old.(*kapi.Node) - newNode := new.(*kapi.Node) - - shouldUpdate, err := shouldUpdate(newNode, oldNode) - if err != nil { - klog.Errorf(err.Error()) - } - if !shouldUpdate { - // the hostsubnet is not assigned by ovn-kubernetes - return - } - - oc.retryNodes.skipRetryObj(oldNode.Name) - if retryEntry := oc.retryNodes.getObjRetryEntry(oldNode.Name); retryEntry != nil && retryEntry.oldObj != nil { - klog.Infof("Detected leftover old node during node update %s.", newNode.Name) - if err := oc.deleteNodeEvent(oldNode); err != nil { - oc.retryNodes.initRetryObjWithAdd(newNode, newNode.Name) - oc.retryNodes.unSkipRetryObj(oldNode.Name) - klog.Errorf("Failed to delete stale node %s, during update: %v", - oldNode.Name, err) - return - } - } - // remove the old node from retry entry since it was correctly deleted - oc.retryNodes.removeDeleteFromRetryObj(oldNode.Name) - - // determine what actually changed in this update - _, nodeSync := oc.addNodeFailed.Load(newNode.Name) - _, failed := oc.nodeClusterRouterPortFailed.Load(newNode.Name) - clusterRtrSync := failed || nodeChassisChanged(oldNode, newNode) || nodeSubnetChanged(oldNode, newNode) - _, failed = oc.mgmtPortFailed.Load(newNode.Name) - mgmtSync := failed || macAddressChanged(oldNode, newNode) || nodeSubnetChanged(oldNode, newNode) - _, failed = oc.gatewaysFailed.Load(newNode.Name) - gwSync := failed || gatewayChanged(oldNode, newNode) || nodeSubnetChanged(oldNode, newNode) || hostAddressesChanged(oldNode, newNode) - - if err := oc.addUpdateNodeEvent(newNode, - &nodeSyncs{nodeSync, clusterRtrSync, mgmtSync, gwSync}); err != nil { - klog.Errorf("Failed to update node %s, error: %v", newNode.Name, err) - oc.retryNodes.initRetryObjWithAdd(newNode, newNode.Name) - oc.retryNodes.unSkipRetryObj(newNode.Name) - return - } - oc.retryNodes.deleteRetryObj(newNode.Name, true) - }, - DeleteFunc: func(obj interface{}) { - node := obj.(*kapi.Node) - oc.retryNodes.skipRetryObj(node.Name) - oc.retryNodes.initRetryObjWithDelete(node, node.Name, nil) - if err := oc.deleteNodeEvent(node); err != nil { - oc.retryNodes.unSkipRetryObj(node.Name) - klog.Errorf("Failed to delete node %s, error: %v", node.Name, err) - return - } - oc.retryNodes.deleteRetryObj(node.Name, true) - }, - }, oc.syncNodes) - klog.Infof("Bootstrapping existing nodes and cleaning stale nodes took %v", time.Since(start)) + 
oc.WatchResource(oc.retryNodes) } // GetNetworkPolicyACLLogging retrieves ACL deny policy logging setting for the Namespace diff --git a/go-controller/pkg/ovn/pods.go b/go-controller/pkg/ovn/pods.go index 16557c36d4..b5264c5fec 100644 --- a/go-controller/pkg/ovn/pods.go +++ b/go-controller/pkg/ovn/pods.go @@ -25,12 +25,6 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb" ) -func (oc *Controller) syncPods(pods []interface{}) { - oc.syncWithRetry("syncPods", func() error { return oc.syncPodsRetriable(pods) }) -} - -// This function implements the main body of work of syncPods. -// Upon failure, it may be invoked multiple times in order to avoid a pod restart. func (oc *Controller) syncPodsRetriable(pods []interface{}) error { var allOps []ovsdb.Operation // get the list of logical switch ports (equivalent to pods) diff --git a/go-controller/pkg/ovn/pods_retry.go b/go-controller/pkg/ovn/pods_retry.go deleted file mode 100644 index 4bd2ec1e95..0000000000 --- a/go-controller/pkg/ovn/pods_retry.go +++ /dev/null @@ -1,228 +0,0 @@ -package ovn - -import ( - "fmt" - "math/rand" - "net" - "time" - - "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" - kapi "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/klog/v2" -) - -// iterateRetryPods checks if any outstanding pods have been waiting for 60 seconds of last known failure -// then tries to re-add them if so -// updateAll forces all pods to be attempted to be retried regardless of the 1 minute delay -func (oc *Controller) iterateRetryPods(updateAll bool) { - oc.retryPodsLock.Lock() - defer oc.retryPodsLock.Unlock() - now := time.Now() - for key, podEntry := range oc.retryPods { - if podEntry.ignore { - // neither addition nor deletion is being retried - continue - } - - pod := podEntry.pod - podDesc := fmt.Sprintf("[%s/%s/%s]", pod.UID, pod.Namespace, pod.Name) - // it could be that the Pod got deleted, but Pod's DeleteFunc has not been called yet, so don't retry creation - if podEntry.needsAdd { - kPod, err := oc.watchFactory.GetPod(pod.Namespace, pod.Name) - if err != nil && errors.IsNotFound(err) { - klog.Infof("%s pod not found in the informers cache, not going to retry pod setup", podDesc) - podEntry.needsAdd = false - } else { - pod = kPod - } - } - - if !util.PodScheduled(pod) { - klog.V(5).Infof("Retry: %s not scheduled", podDesc) - continue - } - - podEntry.backoffSec = (podEntry.backoffSec * 2) - if podEntry.backoffSec > 60 { - podEntry.backoffSec = 60 - } - backoff := (podEntry.backoffSec * time.Second) + (time.Duration(rand.Intn(500)) * time.Millisecond) - podTimer := podEntry.timeStamp.Add(backoff) - if updateAll || now.After(podTimer) { - // check if we need to retry delete first - if podEntry.needsDel != nil { - klog.Infof("%s retry pod teardown", podDesc) - if err := oc.removePod(pod, podEntry.needsDel); err != nil { - klog.Infof("%s teardown retry failed; will try again later", podDesc) - podEntry.timeStamp = time.Now() - continue // if deletion failed we will not retry add - } - klog.Infof("%s pod teardown successful", podDesc) - podEntry.needsDel = nil - if !podEntry.needsAdd { - delete(oc.retryPods, key) // this means retryDelete worked, we can remove entry safely - } - } - // check if we need to retry add - if podEntry.needsAdd { - klog.Infof("%s retry pod setup", podDesc) - if err := oc.ensurePod(nil, pod, true); err != nil { - klog.Infof("%s setup retry failed; will try again later", podDesc) - podEntry.timeStamp = time.Now() - } else { - klog.Infof("%s pod setup successful", 
podDesc) - delete(oc.retryPods, key) // this means retryDelete and retryAdd both worked, we can remove entry safely - } - } - } else { - klog.V(5).Infof("%s retry pod not after timer yet, time: %s", podDesc, podTimer) - } - } -} - -// checkAndDeleteRetryPod deletes a specific entry from the map, if it existed, returns true -func (oc *Controller) checkAndDeleteRetryPod(pod *kapi.Pod) bool { - oc.retryPodsLock.Lock() - defer oc.retryPodsLock.Unlock() - key := getPodNamespacedName(pod) - if _, ok := oc.retryPods[key]; ok { - delete(oc.retryPods, key) - return true - } - return false -} - -// checkAndSkipRetryPod sets a specific entry from the map to be ignored for subsequent retries -// if it existed, returns true -func (oc *Controller) checkAndSkipRetryPod(pod *kapi.Pod) bool { - oc.retryPodsLock.Lock() - defer oc.retryPodsLock.Unlock() - key := getPodNamespacedName(pod) - if entry, ok := oc.retryPods[key]; ok { - entry.ignore = true - return true - } - return false -} - -// unSkipRetryPod ensures a pod is no longer ignored for retry loop -func (oc *Controller) unSkipRetryPod(pod *kapi.Pod) { - oc.retryPodsLock.Lock() - defer oc.retryPodsLock.Unlock() - key := getPodNamespacedName(pod) - if entry, ok := oc.retryPods[key]; ok { - entry.ignore = false - } -} - -// initRetryAddPod tracks a pod that failed to be created to potentially retry later (needsAdd = true) -// initially it is marked as skipped for retry loop (ignore = true) -func (oc *Controller) initRetryAddPod(pod *kapi.Pod) { - oc.retryPodsLock.Lock() - defer oc.retryPodsLock.Unlock() - key := getPodNamespacedName(pod) - if entry, ok := oc.retryPods[key]; ok { - entry.timeStamp = time.Now() - entry.pod = pod - entry.needsAdd = true - } else { - oc.retryPods[key] = &retryEntry{pod, time.Now(), 1, true, true, nil} - } -} - -// initRetryDelPod tracks a pod that failed to be deleted to potentially retry later (needsDel != nil) -// initially it is marked as skipped for retry loop (ignore = true) -func (oc *Controller) initRetryDelPod(pod *kapi.Pod) { - oc.retryPodsLock.Lock() - defer oc.retryPodsLock.Unlock() - key := getPodNamespacedName(pod) - var portInfo *lpInfo - if !util.PodWantsNetwork(pod) { - // create dummy logicalPortInfo for host-networked pods - mac, _ := net.ParseMAC("00:00:00:00:00:00") - portInfo = &lpInfo{ - logicalSwitch: "host-networked", - name: getPodNamespacedName(pod), - uuid: "host-networked", - ips: []*net.IPNet{}, - mac: mac, - } - } else { - portInfo, _ = oc.logicalPortCache.get(key) - } - if entry, ok := oc.retryPods[key]; ok { - entry.timeStamp = time.Now() - entry.needsDel = portInfo - } else { - oc.retryPods[key] = &retryEntry{pod, time.Now(), 1, true, false, portInfo} - } -} - -func (oc *Controller) removeAddRetry(pod *kapi.Pod) { - oc.retryPodsLock.Lock() - defer oc.retryPodsLock.Unlock() - key := getPodNamespacedName(pod) - if entry, ok := oc.retryPods[key]; ok { - entry.needsAdd = false - } -} - -func (oc *Controller) removeDeleteRetry(pod *kapi.Pod) { - oc.retryPodsLock.Lock() - defer oc.retryPodsLock.Unlock() - key := getPodNamespacedName(pod) - if entry, ok := oc.retryPods[key]; ok { - entry.needsDel = nil - } -} - -func (oc *Controller) getPodRetryEntry(pod *kapi.Pod) *retryEntry { - oc.retryPodsLock.Lock() - defer oc.retryPodsLock.Unlock() - key := getPodNamespacedName(pod) - if entry, ok := oc.retryPods[key]; ok { - entryCopy := *entry - return &entryCopy - } - return nil -} - -// addRetryPods adds multiple pods to be retried later for their add events -func (oc *Controller) addRetryPods(pods 
[]kapi.Pod) { - oc.retryPodsLock.Lock() - defer oc.retryPodsLock.Unlock() - for _, pod := range pods { - pod := pod - key := getPodNamespacedName(&pod) - if entry, ok := oc.retryPods[key]; ok { - entry.timeStamp = time.Now() - entry.pod = &pod - } else { - oc.retryPods[key] = &retryEntry{&pod, time.Now(), 1, false, true, nil} - } - } -} - -func (oc *Controller) requestRetryPods() { - select { - case oc.retryPodsChan <- struct{}{}: - klog.V(5).Infof("Iterate retry pods requested") - default: - klog.V(5).Infof("Iterate retry pods already requested") - } -} - -// unit testing only -func (oc *Controller) hasPodRetryEntry(pod *kapi.Pod) bool { - oc.retryPodsLock.Lock() - defer oc.retryPodsLock.Unlock() - _, ok := oc.retryPods[getPodNamespacedName(pod)] - return ok -} - -// getPodNamespacedName returns _ for the provided pod -// this is to maintain the same key format as used by logicalPortCache -func getPodNamespacedName(pod *kapi.Pod) string { - return fmt.Sprintf("%s_%s", pod.Namespace, pod.Name) -} diff --git a/go-controller/pkg/ovn/pods_test.go b/go-controller/pkg/ovn/pods_test.go index 1df4d7b889..07da868da1 100644 --- a/go-controller/pkg/ovn/pods_test.go +++ b/go-controller/pkg/ovn/pods_test.go @@ -11,6 +11,7 @@ import ( "github.com/urfave/cli/v2" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/factory" ovstypes "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" @@ -19,6 +20,7 @@ import ( v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" @@ -430,8 +432,10 @@ var _ = ginkgo.Describe("OVN Pod Operations", func() { return getPodAnnotations(fakeOvn.fakeClient.KubeClient, t2.namespace, t2.podName) }, 2).Should(gomega.HaveLen(0)) + myPod2Key, err := getResourceKey(factory.PodType, myPod2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) gomega.Eventually(func() bool { - return fakeOvn.controller.hasPodRetryEntry(myPod2) + return fakeOvn.controller.retryPods.checkRetryObj(myPod2Key) }).Should(gomega.BeTrue()) ginkgo.By("Marking myPod as completed should free IP") @@ -450,10 +454,10 @@ var _ = ginkgo.Describe("OVN Pod Operations", func() { // there should also be no entry for this pod in the retry cache gomega.Eventually(func() bool { - return fakeOvn.controller.getPodRetryEntry(myPod) == nil - }, 2).Should(gomega.BeTrue()) + return fakeOvn.controller.retryPods.getObjRetryEntry(myPod2Key) == nil + }, retryObjInterval+time.Second).Should(gomega.BeTrue()) ginkgo.By("Freed IP should now allow mypod2 to come up") - fakeOvn.controller.requestRetryPods() + fakeOvn.controller.retryPods.requestRetryObjs() gomega.Eventually(func() string { return getPodAnnotations(fakeOvn.fakeClient.KubeClient, t2.namespace, t2.podName) }, 2).Should(gomega.MatchJSON(t2.getAnnotationsJson())) @@ -508,22 +512,27 @@ var _ = ginkgo.Describe("OVN Pod Operations", func() { err := fakeOvn.controller.ensurePod(nil, podObj, true) // this fails since pod doesn't exist to set annotations gomega.Expect(err).To(gomega.HaveOccurred()) - gomega.Expect(len(fakeOvn.controller.retryPods)).To(gomega.Equal(0)) - fakeOvn.controller.addRetryPods([]v1.Pod{*podObj}) - key := getPodNamespacedName(podObj) - gomega.Expect(len(fakeOvn.controller.retryPods)).To(gomega.Equal(1)) - gomega.Expect(fakeOvn.controller.retryPods[key]).ToNot(gomega.BeNil()) - gomega.Expect(fakeOvn.controller.retryPods[key].ignore).To(gomega.BeFalse()) - 
gomega.Expect(fakeOvn.controller.retryPods[key].pod.UID).To(gomega.Equal(podObj.UID)) + key, err := getResourceKey(factory.PodType, podObj) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + gomega.Expect(len(fakeOvn.controller.retryPods.entries)).To(gomega.Equal(0)) + fakeOvn.controller.retryPods.initRetryObjWithAdd(podObj, key) + fakeOvn.controller.retryPods.unSkipRetryObj(key) + gomega.Expect(len(fakeOvn.controller.retryPods.entries)).To(gomega.Equal(1)) + gomega.Expect(fakeOvn.controller.retryPods.entries[key]).ToNot(gomega.BeNil()) + gomega.Expect(fakeOvn.controller.retryPods.entries[key].ignore).To(gomega.BeFalse()) + storedPod, ok := fakeOvn.controller.retryPods.entries[key].newObj.(*v1.Pod) + gomega.Expect(ok).To(gomega.BeTrue()) + gomega.Expect(storedPod.UID).To(gomega.Equal(podObj.UID)) - fakeOvn.controller.checkAndSkipRetryPod(podObj) - gomega.Expect(fakeOvn.controller.retryPods[key].ignore).To(gomega.BeTrue()) + fakeOvn.controller.retryPods.skipRetryObj(key) + gomega.Expect(fakeOvn.controller.retryPods.entries[key].ignore).To(gomega.BeTrue()) - fakeOvn.controller.unSkipRetryPod(podObj) - gomega.Expect(fakeOvn.controller.retryPods[key].ignore).To(gomega.BeFalse()) + fakeOvn.controller.retryPods.unSkipRetryObj(key) + gomega.Expect(fakeOvn.controller.retryPods.entries[key].ignore).To(gomega.BeFalse()) - fakeOvn.controller.checkAndDeleteRetryPod(podObj) - gomega.Expect(fakeOvn.controller.retryPods[key]).To(gomega.BeNil()) + fakeOvn.controller.retryPods.deleteRetryObj(key, true) + gomega.Expect(fakeOvn.controller.retryPods.entries[key]).To(gomega.BeNil()) return nil } @@ -546,6 +555,9 @@ var _ = ginkgo.Describe("OVN Pod Operations", func() { ) pod := newPod(podTest.namespace, podTest.podName, podTest.nodeName, podTest.podIP) + key, err := getResourceKey(factory.PodType, pod) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + fakeOvn.startWithDBSetup(initialDB, &v1.NamespaceList{ Items: []v1.Namespace{ @@ -575,23 +587,23 @@ var _ = ginkgo.Describe("OVN Pod Operations", func() { time.Sleep(ovstypes.OVSDBTimeout + time.Second) // check to see if the pod retry cache has an entry for this policy - gomega.Eventually(func() *retryEntry { - return fakeOvn.controller.getPodRetryEntry(pod) + gomega.Eventually(func() *retryObjEntry { + return fakeOvn.controller.retryPods.getObjRetryEntry(key) }).ShouldNot(gomega.BeNil()) connCtx, cancel := context.WithTimeout(context.Background(), ovstypes.OVSDBTimeout) defer cancel() resetNBClient(connCtx, fakeOvn.controller.nbClient) - fakeOvn.controller.requestRetryPods() // retry the failed entry + fakeOvn.controller.retryPods.requestRetryObjs() // retry the failed entry - _, err := fakeOvn.fakeClient.KubeClient.CoreV1().Pods(podTest.namespace).Get(context.TODO(), podTest.podName, metav1.GetOptions{}) + _, err = fakeOvn.fakeClient.KubeClient.CoreV1().Pods(podTest.namespace).Get(context.TODO(), podTest.podName, metav1.GetOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) fakeOvn.asf.ExpectAddressSetWithIPs(podTest.namespace, []string{podTest.podIP}) gomega.Eventually(fakeOvn.controller.nbClient).Should( libovsdbtest.HaveData(getExpectedDataPodsAndSwitches([]testPod{podTest}, []string{"node1"})...)) // check the retry cache no longer has the entry - gomega.Eventually(func() *retryEntry { - return fakeOvn.controller.getPodRetryEntry(pod) + gomega.Eventually(func() *retryObjEntry { + return fakeOvn.controller.retryPods.getObjRetryEntry(key) }).Should(gomega.BeNil()) return nil } @@ -615,7 +627,8 @@ var _ = ginkgo.Describe("OVN Pod Operations", 
func() { ) pod := newPod(podTest.namespace, podTest.podName, podTest.nodeName, podTest.podIP) expectedData := []libovsdbtest.TestData{getExpectedDataPodsAndSwitches([]testPod{podTest}, []string{"node1"})} - + key, err := getResourceKey(factory.PodType, pod) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) fakeOvn.startWithDBSetup(initialDB, &v1.NamespaceList{ Items: []v1.Namespace{ @@ -631,7 +644,7 @@ var _ = ginkgo.Describe("OVN Pod Operations", func() { fakeOvn.controller.WatchNamespaces() fakeOvn.controller.WatchPods() - _, err := fakeOvn.fakeClient.KubeClient.CoreV1().Pods(podTest.namespace).Get(context.TODO(), podTest.podName, metav1.GetOptions{}) + _, err = fakeOvn.fakeClient.KubeClient.CoreV1().Pods(podTest.namespace).Get(context.TODO(), podTest.podName, metav1.GetOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) gomega.Eventually(fakeOvn.nbClient).Should(libovsdbtest.HaveData(expectedData...)) fakeOvn.asf.ExpectAddressSetWithIPs(podTest.namespace, []string{podTest.podIP}) @@ -648,22 +661,22 @@ var _ = ginkgo.Describe("OVN Pod Operations", func() { // sleep long enough for TransactWithRetry to fail, causing pod delete to fail time.Sleep(ovstypes.OVSDBTimeout + time.Second) - // check to see if the pod retry cache has an entry for this policy - gomega.Eventually(func() *retryEntry { - return fakeOvn.controller.getPodRetryEntry(pod) + // check to see if the pod retry cache has an entry for this pod + gomega.Eventually(func() *retryObjEntry { + return fakeOvn.controller.retryPods.getObjRetryEntry(key) }).ShouldNot(gomega.BeNil()) connCtx, cancel := context.WithTimeout(context.Background(), ovstypes.OVSDBTimeout) defer cancel() resetNBClient(connCtx, fakeOvn.controller.nbClient) - fakeOvn.controller.requestRetryPods() // retry the failed entry + fakeOvn.controller.retryPods.requestRetryObjs() // retry the failed entry fakeOvn.asf.ExpectEmptyAddressSet(podTest.namespace) gomega.Eventually(fakeOvn.controller.nbClient).Should( libovsdbtest.HaveData(getExpectedDataPodsAndSwitches([]testPod{}, []string{"node1"})...)) // check the retry cache no longer has the entry - gomega.Eventually(func() *retryEntry { - return fakeOvn.controller.getPodRetryEntry(pod) + gomega.Eventually(func() *retryObjEntry { + return fakeOvn.controller.retryPods.getObjRetryEntry(key) }).Should(gomega.BeNil()) return nil } diff --git a/go-controller/pkg/ovn/policy.go b/go-controller/pkg/ovn/policy.go index 5fbb4c6e1b..d90462224f 100644 --- a/go-controller/pkg/ovn/policy.go +++ b/go-controller/pkg/ovn/policy.go @@ -3,9 +3,7 @@ package ovn import ( "errors" "fmt" - "math/rand" "net" - "reflect" "sync" "time" @@ -20,10 +18,10 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" kapi "k8s.io/api/core/v1" knet "k8s.io/api/networking/v1" - kerrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + kerrorsutil "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/wait" - "k8s.io/client-go/tools/cache" "k8s.io/klog/v2" utilnet "k8s.io/utils/net" ) @@ -65,7 +63,7 @@ type networkPolicy struct { svcHandlerList []*factory.Handler nsHandlerList []*factory.Handler - // localPods is a list of pods affected by ths policy + // localPods is a list of pods affected by this policy // this is a sync map so we can handle multiple pods at once // map of string -> *lpInfo localPods sync.Map @@ -113,13 +111,7 @@ func hashedPortGroup(s string) string { return util.HashForOVN(s) } -func (oc *Controller) 
syncNetworkPolicies(networkPolicies []interface{}) { - oc.syncWithRetry("syncNetworkPolicies", func() error { return oc.syncNetworkPoliciesRetriable(networkPolicies) }) -} - -// This function implements the main body of work of syncNetworkPolicies. -// Upon failure, it may be invoked multiple times in order to avoid a pod restart. -func (oc *Controller) syncNetworkPoliciesRetriable(networkPolicies []interface{}) error { +func (oc *Controller) syncNetworkPolicies(networkPolicies []interface{}) error { expectedPolicies := make(map[string]map[string]bool) for _, npInterface := range networkPolicies { policy, ok := npInterface.(*knet.NetworkPolicy) @@ -736,24 +728,28 @@ func (oc *Controller) processLocalPodSelectorSetPods(policy *knet.NetworkPolicy, // Retry if getting pod LSP from the cache fails portInfo, err = oc.logicalPortCache.get(logicalPort) if err != nil { - klog.Warningf("Failed to get LSP for pod %s/%s for networkPolicy %s refetching err: %v", pod.Namespace, pod.Name, policy.Name, err) + klog.Warningf("Failed to get LSP for pod %s/%s for networkPolicy %s refetching err: %v", + pod.Namespace, pod.Name, policy.Name, err) return false, nil } // Retry if LSP is scheduled for deletion if !portInfo.expires.IsZero() { - klog.Warningf("Stale LSP %s for network policy %s found in cache refetching", portInfo.name, policy.Name) + klog.Warningf("Stale LSP %s for network policy %s found in cache refetching", + portInfo.name, policy.Name) return false, nil } // LSP get succeeded and LSP is up to fresh, exit and continue - klog.V(5).Infof("Fresh LSP %s for network policy %s found in cache", portInfo.name, policy.Name) + klog.V(5).Infof("Fresh LSP %s for network policy %s found in cache", + portInfo.name, policy.Name) return true, nil }) if retryErr != nil { // Failed to get an up to date version of the LSP from the cache - klog.Warning("Failed to get LSP after multiple retries for pod %s/%s for networkPolicy %s err: %v", pod.Namespace, pod.Name, policy.Name, retryErr) + klog.Warning("Failed to get LSP after multiple retries for pod %s/%s for networkPolicy %s err: %v", + pod.Namespace, pod.Name, policy.Name, retryErr) return } @@ -828,47 +824,49 @@ func (oc *Controller) processLocalPodSelectorDelPods(np *networkPolicy, // handleLocalPodSelectorAddFunc adds a new pod to an existing NetworkPolicy func (oc *Controller) handleLocalPodSelectorAddFunc(policy *knet.NetworkPolicy, np *networkPolicy, - portGroupIngressDenyName, portGroupEgressDenyName string, obj interface{}) { + portGroupIngressDenyName, portGroupEgressDenyName string, obj interface{}) error { np.RLock() defer np.RUnlock() if np.deleted { - return + return nil } policyPorts, ingressDenyPorts, egressDenyPorts := oc.processLocalPodSelectorSetPods(policy, np, obj) + var errs []error + ops, err := libovsdbops.AddPortsToPortGroupOps(oc.nbClient, nil, portGroupIngressDenyName, ingressDenyPorts...) if err != nil { oc.processLocalPodSelectorDelPods(np, obj) - klog.Errorf(err.Error()) + errs = append(errs, err) } ops, err = libovsdbops.AddPortsToPortGroupOps(oc.nbClient, ops, portGroupEgressDenyName, egressDenyPorts...) if err != nil { oc.processLocalPodSelectorDelPods(np, obj) - klog.Errorf(err.Error()) + errs = append(errs, err) } ops, err = libovsdbops.AddPortsToPortGroupOps(oc.nbClient, ops, np.portGroupName, policyPorts...) 
if err != nil { oc.processLocalPodSelectorDelPods(np, obj) - klog.Errorf(err.Error()) + errs = append(errs, err) } _, err = libovsdbops.TransactAndCheck(oc.nbClient, ops) if err != nil { oc.processLocalPodSelectorDelPods(np, obj) - klog.Errorf(err.Error()) - return + errs = append(errs, err) } + return kerrorsutil.NewAggregate(errs) } func (oc *Controller) handleLocalPodSelectorDelFunc(policy *knet.NetworkPolicy, np *networkPolicy, - portGroupIngressDenyName, portGroupEgressDenyName string, obj interface{}) { + portGroupIngressDenyName, portGroupEgressDenyName string, obj interface{}) error { np.RLock() defer np.RUnlock() if np.deleted { - return + return nil } policyPorts, ingressDenyPorts, egressDenyPorts := oc.processLocalPodSelectorDelPods(np, obj) @@ -876,61 +874,60 @@ func (oc *Controller) handleLocalPodSelectorDelFunc(policy *knet.NetworkPolicy, ops, err := libovsdbops.DeletePortsFromPortGroupOps(oc.nbClient, nil, portGroupIngressDenyName, ingressDenyPorts...) if err != nil { oc.processLocalPodSelectorSetPods(policy, np, obj) - klog.Errorf(err.Error()) - return + return err } ops, err = libovsdbops.DeletePortsFromPortGroupOps(oc.nbClient, ops, portGroupEgressDenyName, egressDenyPorts...) if err != nil { oc.processLocalPodSelectorSetPods(policy, np, obj) - klog.Errorf(err.Error()) - return + return err } + var errs []error + ops, err = libovsdbops.DeletePortsFromPortGroupOps(oc.nbClient, ops, np.portGroupName, policyPorts...) if err != nil { oc.processLocalPodSelectorSetPods(policy, np, obj) - klog.Errorf(err.Error()) + errs = append(errs, err) } _, err = libovsdbops.TransactAndCheck(oc.nbClient, ops) if err != nil { oc.processLocalPodSelectorSetPods(policy, np, obj) - klog.Errorf(err.Error()) - return + errs = append(errs, err) } + + return kerrorsutil.NewAggregate(errs) } func (oc *Controller) handleLocalPodSelector( policy *knet.NetworkPolicy, np *networkPolicy, portGroupIngressDenyName, portGroupEgressDenyName string, - handleInitialItems func([]interface{})) { + handleInitialItems func([]interface{}) error) { - // NetworkPolicy is validated by the apiserver; this can't fail. 
- sel, _ := metav1.LabelSelectorAsSelector(&policy.Spec.PodSelector) - - h := oc.watchFactory.AddFilteredPodHandler(policy.Namespace, sel, - cache.ResourceEventHandlerFuncs{ - AddFunc: func(obj interface{}) { - oc.handleLocalPodSelectorAddFunc(policy, np, portGroupIngressDenyName, portGroupEgressDenyName, obj) - }, - DeleteFunc: func(obj interface{}) { - oc.handleLocalPodSelectorDelFunc(policy, np, portGroupIngressDenyName, portGroupEgressDenyName, obj) - }, - UpdateFunc: func(oldObj, newObj interface{}) { - pod := newObj.(*kapi.Pod) - if util.PodCompleted(pod) { - oc.handleLocalPodSelectorDelFunc(policy, np, portGroupIngressDenyName, portGroupEgressDenyName, oldObj) - } else { - oc.handleLocalPodSelectorAddFunc(policy, np, portGroupIngressDenyName, portGroupEgressDenyName, newObj) - } - }, - }, func(objs []interface{}) { - handleInitialItems(objs) + // NetworkPolicy is validated by the apiserver + sel, err := metav1.LabelSelectorAsSelector(&policy.Spec.PodSelector) + if err != nil { + klog.Errorf("Could not set up watcher for local pods: %v", err) + return + } + + retryLocalPods := NewRetryObjs( + factory.LocalPodSelectorType, + policy.Namespace, + sel, + handleInitialItems, + &NetworkPolicyExtraParameters{ + policy: policy, + np: np, + portGroupIngressDenyName: portGroupIngressDenyName, + portGroupEgressDenyName: portGroupEgressDenyName, }) + podHandler := oc.WatchResource(retryLocalPods) + np.Lock() defer np.Unlock() - np.podHandlerList = append(np.podHandlerList, h) + np.podHandlerList = append(np.podHandlerList, podHandler) } // we only need to create an address set if there is a podSelector or namespaceSelector @@ -1070,20 +1067,22 @@ func (oc *Controller) createNetworkPolicy(np *networkPolicy, policy *knet.Networ // this policy applies to. // Handle initial items locally to minimize DB ops. var selectedPods []interface{} - handleInitialSelectedPods := func(objs []interface{}) { + handleInitialSelectedPods := func(objs []interface{}) error { + var errs []error selectedPods = objs policyPorts, ingressDenyPorts, egressDenyPorts := oc.processLocalPodSelectorSetPods(policy, np, selectedPods...) pg.Ports = append(pg.Ports, policyPorts...) ops, err = libovsdbops.AddPortsToPortGroupOps(oc.nbClient, ops, portGroupIngressDenyName, ingressDenyPorts...) if err != nil { oc.processLocalPodSelectorDelPods(np, selectedPods...) - klog.Errorf(err.Error()) + errs = append(errs, err) } ops, err = libovsdbops.AddPortsToPortGroupOps(oc.nbClient, ops, portGroupEgressDenyName, egressDenyPorts...) if err != nil { oc.processLocalPodSelectorDelPods(np, selectedPods...) 
- klog.Errorf(err.Error()) + errs = append(errs, err) } + return kerrorsutil.NewAggregate(errs) } oc.handleLocalPodSelector(policy, np, portGroupIngressDenyName, portGroupEgressDenyName, handleInitialSelectedPods) @@ -1158,7 +1157,7 @@ func (oc *Controller) addNetworkPolicy(policy *knet.NetworkPolicy) error { if err := oc.deleteNetworkPolicy(policy, np); err != nil { // rollback failed, add to retry to cleanup key := getPolicyNamespacedName(policy) - oc.retryNetPolices.addDeleteToRetryObj(policy, key, np) + oc.retryNetworkPolicies.addDeleteToRetryObj(policy, key, np) } return fmt.Errorf("unable to ensure namespace for network policy: %s, namespace: %s, error: %v", policy.Name, policy.Namespace, err) @@ -1195,8 +1194,8 @@ func (oc *Controller) buildNetworkPolicyACLs(np *networkPolicy, aclLogging strin // deleteNetworkPolicy removes a network policy // If np is provided, then deletion may still occur without a lock on nsInfo func (oc *Controller) deleteNetworkPolicy(policy *knet.NetworkPolicy, np *networkPolicy) error { - klog.Infof("Deleting network policy %s in namespace %s", - policy.Name, policy.Namespace) + klog.Infof("Deleting network policy %s in namespace %s, np is nil: %v", + policy.Name, policy.Namespace, np == nil) nsInfo, nsUnlock := oc.getNamespaceLocked(policy.Namespace, false) if nsInfo == nil { @@ -1322,7 +1321,7 @@ func (oc *Controller) destroyNetworkPolicy(np *networkPolicy, lastPolicy bool) e // handlePeerPodSelectorAddUpdate adds the IP address of a pod that has been // selected as a peer by a NetworkPolicy's ingress/egress section to that // ingress/egress address set -func (oc *Controller) handlePeerPodSelectorAddUpdate(gp *gressPolicy, objs ...interface{}) { +func (oc *Controller) handlePeerPodSelectorAddUpdate(gp *gressPolicy, objs ...interface{}) error { pods := make([]*kapi.Pod, 0, len(objs)) for _, obj := range objs { pod := obj.(*kapi.Pod) @@ -1335,77 +1334,61 @@ func (oc *Controller) handlePeerPodSelectorAddUpdate(gp *gressPolicy, objs ...in // processed this pod event. It will grab it during the pod update event to add the annotation, // so don't log an error here. 
if err := gp.addPeerPods(pods...); err != nil && !errors.Is(err, util.ErrNoPodIPFound) { - klog.Errorf(err.Error()) + return err } + return nil } // handlePeerPodSelectorDelete removes the IP address of a pod that no longer // matches a NetworkPolicy ingress/egress section's selectors from that // ingress/egress address set -func (oc *Controller) handlePeerPodSelectorDelete(gp *gressPolicy, obj interface{}) { +func (oc *Controller) handlePeerPodSelectorDelete(gp *gressPolicy, obj interface{}) error { pod := obj.(*kapi.Pod) if pod.Spec.NodeName == "" { - return + klog.Infof("Pod %s/%s not scheduled on any node, skipping it", pod.Namespace, pod.Name) + return nil } if err := gp.deletePeerPod(pod); err != nil { - klog.Errorf(err.Error()) + return err } + return nil } // handlePeerServiceSelectorAddUpdate adds the VIP of a service that selects // pods that are selected by the Network Policy -func (oc *Controller) handlePeerServiceAdd(gp *gressPolicy, obj interface{}) { - service := obj.(*kapi.Service) +func (oc *Controller) handlePeerServiceAdd(gp *gressPolicy, service *kapi.Service) error { klog.V(5).Infof("A Service: %s matches the namespace as the gress policy: %s", service.Name, gp.policyName) - if err := gp.addPeerSvcVip(oc.nbClient, service); err != nil { - klog.Errorf(err.Error()) - } + return gp.addPeerSvcVip(oc.nbClient, service) } // handlePeerServiceDelete removes the VIP of a service that selects // pods that are selected by the Network Policy -func (oc *Controller) handlePeerServiceDelete(gp *gressPolicy, obj interface{}) { - service := obj.(*kapi.Service) - if err := gp.deletePeerSvcVip(oc.nbClient, service); err != nil { - klog.Errorf(err.Error()) - } +func (oc *Controller) handlePeerServiceDelete(gp *gressPolicy, service *kapi.Service) error { + return gp.deletePeerSvcVip(oc.nbClient, service) } -// Watch Services that are in the same Namespace as the NP +type NetworkPolicyExtraParameters struct { + policy *knet.NetworkPolicy + np *networkPolicy + gp *gressPolicy + podSelector labels.Selector + portGroupIngressDenyName string + portGroupEgressDenyName string +} + +// Watch services that are in the same Namespace as the NP // To account for hairpined traffic func (oc *Controller) handlePeerService( policy *knet.NetworkPolicy, gp *gressPolicy, np *networkPolicy) { - - h := oc.watchFactory.AddFilteredServiceHandler(policy.Namespace, - cache.ResourceEventHandlerFuncs{ - AddFunc: func(obj interface{}) { - // Service is matched so add VIP to addressSet - oc.handlePeerServiceAdd(gp, obj) - }, - DeleteFunc: func(obj interface{}) { - // If Service that has matched pods are deleted remove VIP - oc.handlePeerServiceDelete(gp, obj) - }, - UpdateFunc: func(oldObj, newObj interface{}) { - // If Service Is updated make sure same pods are still matched - oldSvc := oldObj.(*kapi.Service) - newSvc := newObj.(*kapi.Service) - if reflect.DeepEqual(newSvc.Spec.ExternalIPs, oldSvc.Spec.ExternalIPs) && - reflect.DeepEqual(newSvc.Spec.ClusterIP, oldSvc.Spec.ClusterIP) && - reflect.DeepEqual(newSvc.Spec.ClusterIPs, oldSvc.Spec.ClusterIPs) && - reflect.DeepEqual(newSvc.Spec.Type, oldSvc.Spec.Type) && - reflect.DeepEqual(newSvc.Status.LoadBalancer.Ingress, oldSvc.Status.LoadBalancer.Ingress) { - - klog.V(5).Infof("Skipping service update for: %s as change does not apply to any of "+ - ".Spec.ExternalIP, .Spec.ClusterIP, .Spec.ClusterIPs, .Spec.Type, .Status.LoadBalancer.Ingress", newSvc.Name) - return - } - - oc.handlePeerServiceDelete(gp, oldObj) - oc.handlePeerServiceAdd(gp, newObj) - }, - }, nil) - 
np.svcHandlerList = append(np.svcHandlerList, h) + // start watching services in the same namespace as the network policy + retryPeerServices := NewRetryObjs( + factory.PeerServiceType, + policy.Namespace, + nil, nil, + &NetworkPolicyExtraParameters{gp: gp}) + + serviceHandler := oc.WatchResource(retryPeerServices) + np.svcHandlerList = append(np.svcHandlerList, serviceHandler) } func (oc *Controller) handlePeerPodSelector( @@ -1415,26 +1398,16 @@ func (oc *Controller) handlePeerPodSelector( // NetworkPolicy is validated by the apiserver; this can't fail. sel, _ := metav1.LabelSelectorAsSelector(podSelector) - h := oc.watchFactory.AddFilteredPodHandler(policy.Namespace, sel, - cache.ResourceEventHandlerFuncs{ - AddFunc: func(obj interface{}) { - oc.handlePeerPodSelectorAddUpdate(gp, obj) - }, - DeleteFunc: func(obj interface{}) { - oc.handlePeerPodSelectorDelete(gp, obj) - }, - UpdateFunc: func(oldObj, newObj interface{}) { - pod := newObj.(*kapi.Pod) - if util.PodCompleted(pod) { - oc.handlePeerPodSelectorDelete(gp, oldObj) - } else { - oc.handlePeerPodSelectorAddUpdate(gp, newObj) - } - }, - }, func(objs []interface{}) { - oc.handlePeerPodSelectorAddUpdate(gp, objs...) - }) - np.podHandlerList = append(np.podHandlerList, h) + // start watching pods in the same namespace as the network policy and selected by the + // label selector + retryPeerPods := NewRetryObjs( + factory.PeerPodSelectorType, + policy.Namespace, + sel, nil, + &NetworkPolicyExtraParameters{gp: gp}) + + podHandler := oc.WatchResource(retryPeerPods) + np.podHandlerList = append(np.podHandlerList, podHandler) } func (oc *Controller) handlePeerNamespaceAndPodSelector( @@ -1447,64 +1420,23 @@ func (oc *Controller) handlePeerNamespaceAndPodSelector( nsSel, _ := metav1.LabelSelectorAsSelector(namespaceSelector) podSel, _ := metav1.LabelSelectorAsSelector(podSelector) - namespaceHandler := oc.watchFactory.AddFilteredNamespaceHandler("", nsSel, - cache.ResourceEventHandlerFuncs{ - AddFunc: func(obj interface{}) { - namespace := obj.(*kapi.Namespace) - np.RLock() - alreadyDeleted := np.deleted - np.RUnlock() - if alreadyDeleted { - return - } - - // The AddFilteredPodHandler call might call handlePeerPodSelectorAddUpdate - // on existing pods so we can't be holding the lock at this point - podHandler := oc.watchFactory.AddFilteredPodHandler(namespace.Name, podSel, - cache.ResourceEventHandlerFuncs{ - AddFunc: func(obj interface{}) { - oc.handlePeerPodSelectorAddUpdate(gp, obj) - }, - DeleteFunc: func(obj interface{}) { - oc.handlePeerPodSelectorDelete(gp, obj) - }, - UpdateFunc: func(oldObj, newObj interface{}) { - pod := oldObj.(*kapi.Pod) - if util.PodCompleted(pod) { - oc.handlePeerPodSelectorDelete(gp, oldObj) - } else { - oc.handlePeerPodSelectorAddUpdate(gp, newObj) - } - }, - }, func(objs []interface{}) { - oc.handlePeerPodSelectorAddUpdate(gp, objs...) 
- }) - np.Lock() - defer np.Unlock() - if np.deleted { - oc.watchFactory.RemovePodHandler(podHandler) - return - } - np.podHandlerList = append(np.podHandlerList, podHandler) - }, - DeleteFunc: func(obj interface{}) { - // when the namespace labels no longer apply - // remove the namespaces pods from the address_set - namespace := obj.(*kapi.Namespace) - pods, _ := oc.watchFactory.GetPods(namespace.Name) - - for _, pod := range pods { - oc.handlePeerPodSelectorDelete(gp, pod) - } - - }, - UpdateFunc: func(oldObj, newObj interface{}) { - }, - }, nil) + // start watching namespaces selected by the namespace selector nsSel; + // upon namespace add event, start watching pods in that namespace selected + // by the label selector podSel + retryPeerNamespaces := NewRetryObjs( + factory.PeerNamespaceAndPodSelectorType, + "", nsSel, nil, + &NetworkPolicyExtraParameters{ + gp: gp, + np: np, + podSelector: podSel}, // will be used in the addFunc to create a pod handler + ) + + namespaceHandler := oc.WatchResource(retryPeerNamespaces) np.nsHandlerList = append(np.nsHandlerList, namespaceHandler) } -func (oc *Controller) handlePeerNamespaceSelectorOnUpdate(np *networkPolicy, gp *gressPolicy, doUpdate func() bool) { +func (oc *Controller) handlePeerNamespaceSelectorOnUpdate(np *networkPolicy, gp *gressPolicy, doUpdate func() bool) error { aclLoggingLevels := oc.GetNetworkPolicyACLLogging(np.namespace) np.Lock() defer np.Unlock() @@ -1513,17 +1445,18 @@ func (oc *Controller) handlePeerNamespaceSelectorOnUpdate(np *networkPolicy, gp acls := gp.buildLocalPodACLs(np.portGroupName, aclLoggingLevels.Allow) ops, err := libovsdbops.CreateOrUpdateACLsOps(oc.nbClient, nil, acls...) if err != nil { - klog.Errorf(err.Error()) + return err } ops, err = libovsdbops.AddACLsToPortGroupOps(oc.nbClient, ops, np.portGroupName, acls...) if err != nil { - klog.Errorf(err.Error()) + return err } _, err = libovsdbops.TransactAndCheck(oc.nbClient, ops) if err != nil { - klog.Errorf(err.Error()) + return err } } + return nil } func (oc *Controller) handlePeerNamespaceSelector( @@ -1533,38 +1466,16 @@ func (oc *Controller) handlePeerNamespaceSelector( // NetworkPolicy is validated by the apiserver; this can't fail. sel, _ := metav1.LabelSelectorAsSelector(namespaceSelector) - h := oc.watchFactory.AddFilteredNamespaceHandler("", sel, - cache.ResourceEventHandlerFuncs{ - AddFunc: func(obj interface{}) { - namespace := obj.(*kapi.Namespace) - // Update the ACL ... - oc.handlePeerNamespaceSelectorOnUpdate(np, gress, func() bool { - // ... on condition that the added address set was not already in the 'gress policy - return gress.addNamespaceAddressSet(namespace.Name) - }) - }, - DeleteFunc: func(obj interface{}) { - namespace := obj.(*kapi.Namespace) - // Update the ACL ... - oc.handlePeerNamespaceSelectorOnUpdate(np, gress, func() bool { - // ... on condition that the removed address set was in the 'gress policy - return gress.delNamespaceAddressSet(namespace.Name) - }) - }, - UpdateFunc: func(oldObj, newObj interface{}) { - }, - }, func(i []interface{}) { - // This needs to be a write lock because there's no locking around 'gress policies - np.Lock() - defer np.Unlock() - // We load the existing address set into the 'gress policy. - // Notice that this will make the AddFunc for this initial - // address set a noop. - // The ACL must be set explicitly after setting up this handler - // for the address set to be considered. 
- gress.addNamespaceAddressSets(i) - }) - np.nsHandlerList = append(np.nsHandlerList, h) + // start watching namespaces selected by the namespace selector + retryPeerNamespaces := NewRetryObjs( + factory.PeerNamespaceSelectorType, + "", sel, nil, + &NetworkPolicyExtraParameters{gp: gress, np: np}, + ) + + namespaceHandler := oc.WatchResource(retryPeerNamespaces) + + np.nsHandlerList = append(np.nsHandlerList, namespaceHandler) } func (oc *Controller) shutdownHandlers(np *networkPolicy) { @@ -1579,78 +1490,6 @@ func (oc *Controller) shutdownHandlers(np *networkPolicy) { } } -// iterateRetryNetworkPolicies checks if any outstanding NetworkPolicies exist -// then tries to re-add them if so -// updateAll forces all policies to be attempted to be retried regardless -func (oc *Controller) iterateRetryNetworkPolicies(updateAll bool) { - oc.retryNetPolices.retryMutex.Lock() - defer oc.retryNetPolices.retryMutex.Unlock() - now := time.Now() - for namespacedName, npEntry := range oc.retryNetPolices.entries { - if npEntry.ignore { - continue - } - // check if we need to create - if npEntry.newObj != nil { - p := npEntry.newObj.(*knet.NetworkPolicy) - // get the latest version of the new policy from the informer, if it doesn't exist we are not going to - // create the new policy - np, err := oc.watchFactory.GetNetworkPolicy(p.Namespace, p.Name) - if err != nil && kerrors.IsNotFound(err) { - klog.Infof("%s policy not found in the informers cache, not going to retry policy create", namespacedName) - npEntry.newObj = nil - } else { - npEntry.newObj = np - } - } - - npEntry.backoffSec = npEntry.backoffSec * 2 - if npEntry.backoffSec > 60 { - npEntry.backoffSec = 60 - } - backoff := (npEntry.backoffSec * time.Second) + (time.Duration(rand.Intn(500)) * time.Millisecond) - npTimer := npEntry.timeStamp.Add(backoff) - if !updateAll && now.Before(npTimer) { - klog.V(5).Infof("%s retry network policy not after timer yet, time: %s", namespacedName, npTimer) - continue - } - klog.Infof("Network Policy Retry: %s retry network policy setup", namespacedName) - - // check if we need to delete anything - if npEntry.oldObj != nil { - var np *networkPolicy - klog.Infof("Network Policy Retry: Removing old policy for %s", namespacedName) - knp := npEntry.oldObj.(*knet.NetworkPolicy) - if npEntry.config != nil { - np = npEntry.config.(*networkPolicy) - } - if err := oc.deleteNetworkPolicy(knp, np); err != nil { - klog.Infof("Network Policy Retry delete failed for %s, will try again later: %v", - namespacedName, err) - npEntry.timeStamp = time.Now() - continue - } - // successfully cleaned up old policy, remove it from the retry cache - npEntry.oldObj = nil - } - - // create new policy if needed - if npEntry.newObj != nil { - klog.Infof("Network Policy Retry: Creating new policy for %s", namespacedName) - if err := oc.addNetworkPolicy(npEntry.newObj.(*knet.NetworkPolicy)); err != nil { - klog.Infof("Network Policy Retry create failed for %s, will try again later: %v", - namespacedName, err) - npEntry.timeStamp = time.Now() - continue - } - // successfully cleaned up old policy, remove it from the retry cache - npEntry.newObj = nil - } - - klog.Infof("Network Policy Retry successful for %s", namespacedName) - oc.retryNetPolices.deleteRetryObj(namespacedName, false) - } -} func getPolicyNamespacedName(policy *knet.NetworkPolicy) string { return fmt.Sprintf("%v/%v", policy.Namespace, policy.Name) } diff --git a/go-controller/pkg/ovn/policy_test.go b/go-controller/pkg/ovn/policy_test.go index 5b7116a40c..2e972dfe9b 100644 --- 
a/go-controller/pkg/ovn/policy_test.go +++ b/go-controller/pkg/ovn/policy_test.go @@ -1415,29 +1415,27 @@ var _ = ginkgo.Describe("OVN NetworkPolicy Operations", func() { // Create a second NP ginkgo.By("Creating and deleting another policy that references that pod") - _, err = fakeOvn.fakeClient.KubeClient.NetworkingV1().NetworkPolicies(networkPolicy.Namespace).Create(context.TODO(), networkPolicy2, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - // sleep long enough for TransactWithRetry to fail, causing NP Add to fail time.Sleep(types.OVSDBTimeout + time.Second) // check to see if the retry cache has an entry for this policy key := getPolicyNamespacedName(networkPolicy2) - fakeOvn.controller.retryNetPolices.getObjRetryEntry(key) + fakeOvn.controller.retryNetworkPolicies.getObjRetryEntry(key) gomega.Eventually(func() *retryObjEntry { - return fakeOvn.controller.retryNetPolices.getObjRetryEntry(key) + return fakeOvn.controller.retryNetworkPolicies.getObjRetryEntry(key) }).ShouldNot(gomega.BeNil()) connCtx, cancel := context.WithTimeout(context.Background(), types.OVSDBTimeout) defer cancel() resetNBClient(connCtx, fakeOvn.controller.nbClient) - fakeOvn.controller.retryNetPolices.requestRetryObjs() + fakeOvn.controller.retryNetworkPolicies.requestRetryObjs() gressPolicy2ExpectedData := npTest.getPolicyData(networkPolicy2, []string{nPodTest.portUUID}, nil, []int32{portNum + 1}, nbdb.ACLSeverityInfo, false) expectedDataWithPolicy2 := append(expectedData, gressPolicy2ExpectedData...) gomega.Eventually(fakeOvn.nbClient).Should(libovsdb.HaveData(expectedDataWithPolicy2...)) // check the cache no longer has the entry gomega.Eventually(func() *retryObjEntry { - return fakeOvn.controller.retryNetPolices.getObjRetryEntry(key) + return fakeOvn.controller.retryNetworkPolicies.getObjRetryEntry(key) }).Should(gomega.BeNil()) return nil @@ -1558,9 +1556,9 @@ var _ = ginkgo.Describe("OVN NetworkPolicy Operations", func() { // check to see if the retry cache has an entry for this policy key := getPolicyNamespacedName(networkPolicy2) gomega.Eventually(func() *retryObjEntry { - return fakeOvn.controller.retryNetPolices.getObjRetryEntry(key) + return fakeOvn.controller.retryNetworkPolicies.getObjRetryEntry(key) }).ShouldNot(gomega.BeNil()) - if retryEntry := fakeOvn.controller.retryNetPolices.getObjRetryEntry(key); retryEntry != nil { + if retryEntry := fakeOvn.controller.retryNetworkPolicies.getObjRetryEntry(key); retryEntry != nil { ginkgo.By("retry entry new policy should be nil") gomega.Expect(retryEntry.newObj).To(gomega.BeNil()) ginkgo.By("retry entry old policy should not be nil") @@ -1590,7 +1588,7 @@ var _ = ginkgo.Describe("OVN NetworkPolicy Operations", func() { // check the cache no longer has the entry key = getPolicyNamespacedName(networkPolicy) gomega.Eventually(func() *retryObjEntry { - return fakeOvn.controller.retryNetPolices.getObjRetryEntry(key) + return fakeOvn.controller.retryNetworkPolicies.getObjRetryEntry(key) }).Should(gomega.BeNil()) return nil @@ -2294,12 +2292,12 @@ var _ = ginkgo.Describe("OVN NetworkPolicy Operations", func() { // check to see if the retry cache has an entry for this policy key := getPolicyNamespacedName(networkPolicy) gomega.Eventually(func() *retryObjEntry { - return fakeOvn.controller.retryNetPolices.getObjRetryEntry(key) + return fakeOvn.controller.retryNetworkPolicies.getObjRetryEntry(key) }).ShouldNot(gomega.BeNil()) connCtx, cancel := context.WithTimeout(context.Background(), types.OVSDBTimeout) defer cancel() 
resetNBClient(connCtx, fakeOvn.controller.nbClient) - fakeOvn.controller.retryNetPolices.requestRetryObjs() + fakeOvn.controller.retryNetworkPolicies.requestRetryObjs() eventuallyExpectNoAddressSets(fakeOvn, networkPolicy) @@ -2310,7 +2308,7 @@ var _ = ginkgo.Describe("OVN NetworkPolicy Operations", func() { // check the cache no longer has the entry gomega.Eventually(func() *retryObjEntry { - return fakeOvn.controller.retryNetPolices.getObjRetryEntry(key) + return fakeOvn.controller.retryNetworkPolicies.getObjRetryEntry(key) }).Should(gomega.BeNil()) return nil }
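For reviewers tracing the retry refactor through the hunks above, here is a minimal, self-contained sketch of the pattern the patch keeps instantiating: a mutex-protected cache of per-object retry entries with an ignore flag that handlers toggle around failed add/update/delete operations, plus a pass that retries eligible entries and drops them on success. The identifiers below (retryCache, retryEntry, add, unSkip, iterate) are invented for illustration and only loosely mirror the initRetryObjWithAdd, unSkipRetryObj, requestRetryObjs and deleteRetryObj calls visible in the tests; they are not the actual retryObjs API of this repository.

// Hypothetical sketch (not repository code): the retry-cache shape being generalized here.
package main

import (
	"fmt"
	"sync"
	"time"
)

// retryEntry loosely mirrors retryObjEntry: the object that failed, when it
// failed, and whether the retry loop should skip it for now.
type retryEntry struct {
	obj       interface{}
	timeStamp time.Time
	ignore    bool
}

// retryCache loosely mirrors the generic retry structure: a map keyed by
// "namespace/name", guarded by a mutex because handlers run concurrently.
type retryCache struct {
	sync.Mutex
	entries map[string]*retryEntry
}

func newRetryCache() *retryCache {
	return &retryCache{entries: make(map[string]*retryEntry)}
}

// add records a failed object, initially skipped, so the event handler can
// finish before the retry loop picks it up (compare initRetryObjWithAdd).
func (r *retryCache) add(key string, obj interface{}) {
	r.Lock()
	defer r.Unlock()
	r.entries[key] = &retryEntry{obj: obj, timeStamp: time.Now(), ignore: true}
}

// unSkip makes an entry eligible for the retry loop (compare unSkipRetryObj).
func (r *retryCache) unSkip(key string) {
	r.Lock()
	defer r.Unlock()
	if e, ok := r.entries[key]; ok {
		e.ignore = false
	}
}

// iterate retries every eligible entry and drops it on success, analogous to
// the pass triggered by requestRetryObjs; failures stay cached for next time.
func (r *retryCache) iterate(retry func(obj interface{}) error) {
	r.Lock()
	defer r.Unlock()
	for key, e := range r.entries {
		if e.ignore {
			continue
		}
		if err := retry(e.obj); err != nil {
			e.timeStamp = time.Now()
			continue
		}
		delete(r.entries, key)
	}
}

func main() {
	cache := newRetryCache()
	cache.add("ns1/pod-a", "pod-a") // creation failed: remember the object
	cache.unSkip("ns1/pod-a")       // handler done: allow the retry loop to act

	cache.iterate(func(obj interface{}) error {
		fmt.Printf("retrying %v\n", obj)
		return nil // success removes the entry from the cache
	})
}

Keeping the failure bookkeeping in one generic structure like this is what lets the patch replace the per-type maps (retryPods, retryNetPolices) and their hand-rolled iterate functions with a single WatchResource entry point parameterized by resource type.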