diff --git a/contrib/kind.sh b/contrib/kind.sh index 2044daa997..485d644461 100755 --- a/contrib/kind.sh +++ b/contrib/kind.sh @@ -143,6 +143,7 @@ usage() { echo "-cn | --cluster-name Configure the kind cluster's name" echo "-ric | --run-in-container Configure the script to be run from a docker container, allowing it to still communicate with the kind controlplane" echo "-ehp | --egress-ip-healthcheck-port TCP port used for gRPC session by egress IP node check. DEFAULT: 9107 (Use "0" for legacy dial to port 9)." + echo "-sm | --scale-metrics Enable scale metrics" echo "--delete Delete current cluster" echo "" } @@ -281,6 +282,8 @@ parse_args() { fi OVN_EGRESSIP_HEALTHCHECK_PORT=$1 ;; + -sm | --scale-metrics ) OVN_METRICS_SCALE_ENABLE=true + ;; --delete ) delete exit ;; @@ -337,6 +340,7 @@ print_params() { echo "OVN_ENABLE_EX_GW_NETWORK_BRIDGE = $OVN_ENABLE_EX_GW_NETWORK_BRIDGE" echo "OVN_EX_GW_NETWORK_INTERFACE = $OVN_EX_GW_NETWORK_INTERFACE" echo "OVN_EGRESSIP_HEALTHCHECK_PORT = $OVN_EGRESSIP_HEALTHCHECK_PORT" + echo "OVN_METRICS_SCALE_ENABLE = $OVN_METRICS_SCALE_ENABLE" echo "" } @@ -454,6 +458,7 @@ set_default_params() { OVN_HOST_NETWORK_NAMESPACE=${OVN_HOST_NETWORK_NAMESPACE:-ovn-host-network} OVN_EGRESSIP_HEALTHCHECK_PORT=${OVN_EGRESSIP_HEALTHCHECK_PORT:-9107} OCI_BIN=${KIND_EXPERIMENTAL_PROVIDER:-docker} + OVN_METRICS_SCALE_ENABLE=${OVN_METRICS_SCALE_ENABLE:-false} } detect_apiserver_url() { @@ -655,7 +660,8 @@ create_ovn_kube_manifests() { --egress-qos-enable=true \ --v4-join-subnet="${JOIN_SUBNET_IPV4}" \ --v6-join-subnet="${JOIN_SUBNET_IPV6}" \ - --ex-gw-network-interface="${OVN_EX_GW_NETWORK_INTERFACE}" + --ex-gw-network-interface="${OVN_EX_GW_NETWORK_INTERFACE}" \ + --ovnkube-metrics-scale-enable="${OVN_METRICS_SCALE_ENABLE}" popd } diff --git a/dist/images/daemonset.sh b/dist/images/daemonset.sh index 0983760c23..dfb0b73640 100755 --- a/dist/images/daemonset.sh +++ b/dist/images/daemonset.sh @@ -76,6 +76,7 @@ OVN_HOST_NETWORK_NAMESPACE="" 
OVN_EX_GW_NETWORK_INTERFACE="" OVNKUBE_NODE_MGMT_PORT_NETDEV="" OVNKUBE_CONFIG_DURATION_ENABLE= +OVNKUBE_METRICS_SCALE_ENABLE= # IN_UPGRADE is true only if called by upgrade-ovn.sh during the upgrade test, # it will render only the parts in ovn-setup.yaml related to RBAC permissions. IN_UPGRADE= @@ -256,6 +257,9 @@ while [ "$1" != "" ]; do --ovnkube-config-duration-enable) OVNKUBE_CONFIG_DURATION_ENABLE=$VALUE ;; + --ovnkube-metrics-scale-enable) + OVNKUBE_METRICS_SCALE_ENABLE=$VALUE + ;; --in-upgrade) IN_UPGRADE=true ;; @@ -393,6 +397,8 @@ ovnkube_node_mgmt_port_netdev=${OVNKUBE_NODE_MGMT_PORT_NETDEV} echo "ovnkube_node_mgmt_port_netdev: ${ovnkube_node_mgmt_port_netdev}" ovnkube_config_duration_enable=${OVNKUBE_CONFIG_DURATION_ENABLE} echo "ovnkube_config_duration_enable: ${ovnkube_config_duration_enable}" +ovnkube_metrics_scale_enable=${OVNKUBE_METRICS_SCALE_ENABLE} +echo "ovnkube_metrics_scale_enable: ${ovnkube_metrics_scale_enable}" ovn_image=${image} \ ovn_image_pull_policy=${image_pull_policy} \ @@ -473,6 +479,7 @@ ovn_image=${image} \ ovnkube_logfile_maxbackups=${ovnkube_logfile_maxbackups} \ ovnkube_logfile_maxage=${ovnkube_logfile_maxage} \ ovnkube_config_duration_enable=${ovnkube_config_duration_enable} \ + ovnkube_metrics_scale_enable=${ovnkube_metrics_scale_enable} \ ovn_acl_logging_rate_limit=${ovn_acl_logging_rate_limit} \ ovn_hybrid_overlay_net_cidr=${ovn_hybrid_overlay_net_cidr} \ ovn_hybrid_overlay_enable=${ovn_hybrid_overlay_enable} \ diff --git a/dist/images/ovnkube.sh b/dist/images/ovnkube.sh index 2ea4430b96..e370be7cf7 100755 --- a/dist/images/ovnkube.sh +++ b/dist/images/ovnkube.sh @@ -233,6 +233,7 @@ ovnkube_node_mode=${OVNKUBE_NODE_MODE:-"full"} # OVNKUBE_NODE_MGMT_PORT_NETDEV - is the net device to be used for management port ovnkube_node_mgmt_port_netdev=${OVNKUBE_NODE_MGMT_PORT_NETDEV:-} ovnkube_config_duration_enable=${OVNKUBE_CONFIG_DURATION_ENABLE:-false} +ovnkube_metrics_scale_enable=${OVNKUBE_METRICS_SCALE_ENABLE:-false} # 
OVN_ENCAP_IP - encap IP to be used for OVN traffic on the node ovn_encap_ip=${OVN_ENCAP_IP:-} @@ -982,6 +983,12 @@ ovn-master() { fi echo "ovnkube_config_duration_enable_flag: ${ovnkube_config_duration_enable_flag}" + ovnkube_metrics_scale_enable_flag= + if [[ ${ovnkube_metrics_scale_enable} == "true" ]]; then + ovnkube_metrics_scale_enable_flag="--metrics-enable-scale" + fi + echo "ovnkube_metrics_scale_enable_flag: ${ovnkube_metrics_scale_enable_flag}" + echo "=============== ovn-master ========== MASTER ONLY" /usr/bin/ovnkube \ --init-master ${K8S_NODE} \ @@ -1008,6 +1015,7 @@ ovn-master() { ${egressfirewall_enabled_flag} \ ${egressqos_enabled_flag} \ ${ovnkube_config_duration_enable_flag} \ + ${ovnkube_metrics_scale_enable_flag} \ --metrics-bind-address ${ovnkube_master_metrics_bind_address} \ --host-network-namespace ${ovn_host_network_namespace} & diff --git a/dist/templates/ovnkube-master.yaml.j2 b/dist/templates/ovnkube-master.yaml.j2 index 1724eb3cc6..8f5c8d2305 100644 --- a/dist/templates/ovnkube-master.yaml.j2 +++ b/dist/templates/ovnkube-master.yaml.j2 @@ -164,6 +164,8 @@ spec: value: "{{ ovnkube_logfile_maxage }}" - name: OVNKUBE_CONFIG_DURATION_ENABLE value: "{{ ovnkube_config_duration_enable }}" + - name: OVNKUBE_METRICS_SCALE_ENABLE + value: "{{ ovnkube_metrics_scale_enable }}" - name: OVN_NET_CIDR valueFrom: configMapKeyRef: diff --git a/go-controller/pkg/config/config.go b/go-controller/pkg/config/config.go index 5da94bdcf0..ad0bca208d 100644 --- a/go-controller/pkg/config/config.go +++ b/go-controller/pkg/config/config.go @@ -326,8 +326,8 @@ type MetricsConfig struct { NodeServerCert string `gcfg:"node-server-cert"` // EnableConfigDuration holds the boolean flag to enable OVN-Kubernetes master to monitor OVN-Kubernetes master // configuration duration and optionally, its application to all nodes - EnableConfigDuration bool `gcfg:"enable-config-duration"` - EnableEIPScaleMetrics bool `gcfg:"enable-eip-scale-metrics"` + EnableConfigDuration bool 
`gcfg:"enable-config-duration"` + EnableScaleMetrics bool `gcfg:"enable-scale-metrics"` } // OVNKubernetesFeatureConfig holds OVN-Kubernetes feature enhancement config file parameters and command-line overrides @@ -1015,9 +1015,9 @@ var MetricsFlags = []cli.Flag{ Destination: &cliConfig.Metrics.EnableConfigDuration, }, &cli.BoolFlag{ - Name: "metrics-enable-eip-scale", - Usage: "Enables metrics related to Egress IP scaling", - Destination: &cliConfig.Metrics.EnableEIPScaleMetrics, + Name: "metrics-enable-scale", + Usage: "Enables metrics related to scaling", + Destination: &cliConfig.Metrics.EnableScaleMetrics, }, } diff --git a/go-controller/pkg/config/config_test.go b/go-controller/pkg/config/config_test.go index 4d7811accb..e461b6d5e4 100644 --- a/go-controller/pkg/config/config_test.go +++ b/go-controller/pkg/config/config_test.go @@ -158,7 +158,7 @@ enable-pprof=true node-server-privkey=/path/to/node-metrics-private.key node-server-cert=/path/to/node-metrics.crt enable-config-duration=true -enable-eip-scale-metrics=true +enable-scale-metrics=true [logging] loglevel=5 @@ -581,7 +581,7 @@ var _ = Describe("Config Operations", func() { gomega.Expect(Metrics.NodeServerPrivKey).To(gomega.Equal("/path/to/node-metrics-private.key")) gomega.Expect(Metrics.NodeServerCert).To(gomega.Equal("/path/to/node-metrics.crt")) gomega.Expect(Metrics.EnableConfigDuration).To(gomega.Equal(true)) - gomega.Expect(Metrics.EnableEIPScaleMetrics).To(gomega.Equal(true)) + gomega.Expect(Metrics.EnableScaleMetrics).To(gomega.Equal(true)) gomega.Expect(OvnNorth.Scheme).To(gomega.Equal(OvnDBSchemeSSL)) gomega.Expect(OvnNorth.PrivKey).To(gomega.Equal("/path/to/nb-client-private.key")) @@ -667,7 +667,7 @@ var _ = Describe("Config Operations", func() { gomega.Expect(Metrics.NodeServerPrivKey).To(gomega.Equal("/tls/nodeprivkey")) gomega.Expect(Metrics.NodeServerCert).To(gomega.Equal("/tls/nodecert")) gomega.Expect(Metrics.EnableConfigDuration).To(gomega.Equal(true)) - 
gomega.Expect(Metrics.EnableEIPScaleMetrics).To(gomega.Equal(true)) + gomega.Expect(Metrics.EnableScaleMetrics).To(gomega.Equal(true)) gomega.Expect(OvnNorth.Scheme).To(gomega.Equal(OvnDBSchemeSSL)) gomega.Expect(OvnNorth.PrivKey).To(gomega.Equal("/client/privkey")) diff --git a/go-controller/pkg/metrics/master.go b/go-controller/pkg/metrics/master.go index a395438e4e..85b9045178 100644 --- a/go-controller/pkg/metrics/master.go +++ b/go-controller/pkg/metrics/master.go @@ -146,6 +146,17 @@ var MetricMasterReadyDuration = prometheus.NewGauge(prometheus.GaugeOpts{ Help: "The duration for the master to get to ready state", }) +// MetricMasterSyncDuration is the time taken to complete the initial Watch for each resource. +// The resource name is in the label. +var MetricMasterSyncDuration = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: MetricOvnkubeNamespace, + Subsystem: MetricOvnkubeSubsystemMaster, + Name: "sync_duration_seconds", + Help: "The duration to sync and setup all handlers for a given resource"}, + []string{ + "resource_name", + }) + // MetricMasterLeader identifies whether this instance of ovnkube-master is a leader or not var MetricMasterLeader = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: MetricOvnkubeNamespace, @@ -219,6 +230,66 @@ var metricEgressIPRebalanceCount = prometheus.NewCounter(prometheus.CounterOpts{ Help: "The total number of times assigned egress IP(s) needed to be moved to a different node"}, ) +var metricNetpolEventLatency = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: MetricOvnkubeNamespace, + Subsystem: MetricOvnkubeSubsystemMaster, + Name: "network_policy_event_latency_seconds", + Help: "The latency of full network policy event handling (add, delete)", + Buckets: prometheus.ExponentialBuckets(.004, 2, 15)}, + []string{ + "event", + }) + +var metricNetpolLocalPodEventLatency = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: MetricOvnkubeNamespace, + Subsystem:
MetricOvnkubeSubsystemMaster, + Name: "network_policy_local_pod_event_latency_seconds", + Help: "The latency of local pod events handling (add, delete)", + Buckets: prometheus.ExponentialBuckets(.002, 2, 15)}, + []string{ + "event", + }) + +var metricNetpolPeerPodEventLatency = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: MetricOvnkubeNamespace, + Subsystem: MetricOvnkubeSubsystemMaster, + Name: "network_policy_peer_pod_event_latency_seconds", + Help: "The latency of peer pod events handling (add, delete)", + Buckets: prometheus.ExponentialBuckets(.002, 2, 15)}, + []string{ + "event", + }) + +var metricNetpolPeerNamespaceEventLatency = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: MetricOvnkubeNamespace, + Subsystem: MetricOvnkubeSubsystemMaster, + Name: "network_policy_peer_namespace_event_latency_seconds", + Help: "The latency of peer namespace events handling (add, delete)", + Buckets: prometheus.ExponentialBuckets(.002, 2, 15)}, + []string{ + "event", + }) + +var metricNetpolPeerNamespaceAndPodEventLatency = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: MetricOvnkubeNamespace, + Subsystem: MetricOvnkubeSubsystemMaster, + Name: "network_policy_peer_namespace_and_pod_event_latency_seconds", + Help: "The latency of peer namespace and pod events handling (add, delete)", + Buckets: prometheus.ExponentialBuckets(.002, 2, 15)}, + []string{ + "event", + }) + +var metricPodEventLatency = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: MetricOvnkubeNamespace, + Subsystem: MetricOvnkubeSubsystemMaster, + Name: "pod_event_latency_seconds", + Help: "The latency of pod events handling (add, update, delete)", + Buckets: prometheus.ExponentialBuckets(.002, 2, 15)}, + []string{ + "event", + }) + var metricEgressFirewallRuleCount = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: MetricOvnkubeNamespace, Subsystem: MetricOvnkubeSubsystemMaster, @@ -317,6 +388,7 @@ const ( func RegisterMasterBase() {
prometheus.MustRegister(MetricMasterLeader) prometheus.MustRegister(MetricMasterReadyDuration) + prometheus.MustRegister(MetricMasterSyncDuration) prometheus.MustRegister(prometheus.NewGaugeFunc( prometheus.GaugeOpts{ Namespace: MetricOvnkubeNamespace, @@ -374,9 +446,16 @@ func RegisterMasterFunctional() { prometheus.MustRegister(metricV4AllocatedHostSubnetCount) prometheus.MustRegister(metricV6AllocatedHostSubnetCount) prometheus.MustRegister(metricEgressIPCount) - if config.Metrics.EnableEIPScaleMetrics { + if config.Metrics.EnableScaleMetrics { + klog.Infof("Scale metrics are enabled") prometheus.MustRegister(metricEgressIPAssignLatency) prometheus.MustRegister(metricEgressIPUnassignLatency) + prometheus.MustRegister(metricNetpolEventLatency) + prometheus.MustRegister(metricNetpolLocalPodEventLatency) + prometheus.MustRegister(metricNetpolPeerPodEventLatency) + prometheus.MustRegister(metricNetpolPeerNamespaceEventLatency) + prometheus.MustRegister(metricNetpolPeerNamespaceAndPodEventLatency) + prometheus.MustRegister(metricPodEventLatency) } prometheus.MustRegister(metricEgressIPNodeUnreacheableCount) prometheus.MustRegister(metricEgressIPRebalanceCount) @@ -489,6 +568,30 @@ func RecordEgressIPRebalance(count int) { metricEgressIPRebalanceCount.Add(float64(count)) } +func RecordNetpolEvent(eventName string, duration time.Duration) { + metricNetpolEventLatency.WithLabelValues(eventName).Observe(duration.Seconds()) +} + +func RecordNetpolLocalPodEvent(eventName string, duration time.Duration) { + metricNetpolLocalPodEventLatency.WithLabelValues(eventName).Observe(duration.Seconds()) +} + +func RecordNetpolPeerPodEvent(eventName string, duration time.Duration) { + metricNetpolPeerPodEventLatency.WithLabelValues(eventName).Observe(duration.Seconds()) +} + +func RecordNetpolPeerNamespaceEvent(eventName string, duration time.Duration) { + metricNetpolPeerNamespaceEventLatency.WithLabelValues(eventName).Observe(duration.Seconds()) +} + +func 
RecordNetpolPeerNamespaceAndPodEvent(eventName string, duration time.Duration) { + metricNetpolPeerNamespaceAndPodEventLatency.WithLabelValues(eventName).Observe(duration.Seconds()) +} + +func RecordPodEvent(eventName string, duration time.Duration) { + metricPodEventLatency.WithLabelValues(eventName).Observe(duration.Seconds()) +} + // UpdateEgressFirewallRuleCount records the number of Egress firewall rules. func UpdateEgressFirewallRuleCount(count float64) { metricEgressFirewallRuleCount.Add(count) diff --git a/go-controller/pkg/ovn/egressip.go b/go-controller/pkg/ovn/egressip.go index 953ab300e5..2c47084434 100644 --- a/go-controller/pkg/ovn/egressip.go +++ b/go-controller/pkg/ovn/egressip.go @@ -2171,7 +2171,7 @@ func (oc *Controller) addStandByEgressIPAssignment(podKey string, podStatus *pod // (routing pod traffic to the egress node) and NAT objects on the egress node // (SNAT-ing to the egress IP). func (e *egressIPController) addPodEgressIPAssignment(egressIPName string, status egressipv1.EgressIPStatusItem, pod *kapi.Pod, podIPs []*net.IPNet) (err error) { - if config.Metrics.EnableEIPScaleMetrics { + if config.Metrics.EnableScaleMetrics { start := time.Now() defer func() { if err != nil { @@ -2199,7 +2199,7 @@ func (e *egressIPController) addPodEgressIPAssignment(egressIPName string, statu // deletePodEgressIPAssignment deletes the OVN programmed egress IP // configuration mentioned for addPodEgressIPAssignment. 
func (e *egressIPController) deletePodEgressIPAssignment(egressIPName string, status egressipv1.EgressIPStatusItem, podIPs []*net.IPNet) (err error) { - if config.Metrics.EnableEIPScaleMetrics { + if config.Metrics.EnableScaleMetrics { start := time.Now() defer func() { if err != nil { diff --git a/go-controller/pkg/ovn/ovn.go b/go-controller/pkg/ovn/ovn.go index 4c18f11da7..fc9e9824a8 100644 --- a/go-controller/pkg/ovn/ovn.go +++ b/go-controller/pkg/ovn/ovn.go @@ -317,35 +317,39 @@ func (oc *Controller) Run(ctx context.Context, wg *sync.WaitGroup) error { // Sync external gateway routes. External gateway may be set in namespaces // or via pods. So execute an individual sync method at startup - oc.cleanExGwECMPRoutes() + WithSyncDurationMetricNoError("external gateway routes", oc.cleanExGwECMPRoutes) // WatchNamespaces() should be started first because it has no other // dependencies, and WatchNodes() depends on it - if err := oc.WatchNamespaces(); err != nil { + if err := WithSyncDurationMetric("namespace", oc.WatchNamespaces); err != nil { return err } // WatchNodes must be started next because it creates the node switch // which most other watches depend on. 
// https://github.com/ovn-org/ovn-kubernetes/pull/859 - if err := oc.WatchNodes(); err != nil { + if err := WithSyncDurationMetric("node", oc.WatchNodes); err != nil { return err } + startSvc := time.Now() // Start service watch factory and sync services oc.svcFactory.Start(oc.stopChan) // Services should be started after nodes to prevent LB churn - if err := oc.StartServiceController(wg, true); err != nil { + err := oc.StartServiceController(wg, true) + endSvc := time.Since(startSvc) + metrics.MetricMasterSyncDuration.WithLabelValues("service").Set(endSvc.Seconds()) + if err != nil { return err } - if err := oc.WatchPods(); err != nil { + if err := WithSyncDurationMetric("pod", oc.WatchPods); err != nil { return err } // WatchNetworkPolicy depends on WatchPods and WatchNamespaces - if err := oc.WatchNetworkPolicy(); err != nil { + if err := WithSyncDurationMetric("network policy", oc.WatchNetworkPolicy); err != nil { return err } @@ -360,16 +364,16 @@ func (oc *Controller) Run(ctx context.Context, wg *sync.WaitGroup) error { // risk performing a bunch of modifications on the EgressIP objects when // we restart and then have these handlers act on stale data when they // sync. 
- if err := oc.WatchEgressIPNamespaces(); err != nil { + if err := WithSyncDurationMetric("egress ip namespace", oc.WatchEgressIPNamespaces); err != nil { return err } - if err := oc.WatchEgressIPPods(); err != nil { + if err := WithSyncDurationMetric("egress ip pod", oc.WatchEgressIPPods); err != nil { return err } - if err := oc.WatchEgressNodes(); err != nil { + if err := WithSyncDurationMetric("egress node", oc.WatchEgressNodes); err != nil { return err } - if err := oc.WatchEgressIP(); err != nil { + if err := WithSyncDurationMetric("egress ip", oc.WatchEgressIP); err != nil { return err } if util.PlatformTypeIsEgressIPCloudProvider() { @@ -392,7 +396,7 @@ func (oc *Controller) Run(ctx context.Context, wg *sync.WaitGroup) error { return err } oc.egressFirewallDNS.Run(egressFirewallDNSDefaultDuration) - err = oc.WatchEgressFirewall() + err = WithSyncDurationMetric("egress firewall", oc.WatchEgressFirewall) if err != nil { return err } @@ -416,7 +420,9 @@ func (oc *Controller) Run(ctx context.Context, wg *sync.WaitGroup) error { oc.egressSvcController.Run(1) }() - klog.Infof("Completing all the Watchers took %v", time.Since(start)) + end := time.Since(start) + klog.Infof("Completing all the Watchers took %v", end) + metrics.MetricMasterSyncDuration.WithLabelValues("all watchers").Set(end.Seconds()) if config.Kubernetes.OVNEmptyLbEvents { klog.Infof("Starting unidling controller") @@ -436,7 +442,7 @@ func (oc *Controller) Run(ctx context.Context, wg *sync.WaitGroup) error { } // Final step to cleanup after resource handlers have synced - err := oc.ovnTopologyCleanup() + err = oc.ovnTopologyCleanup() if err != nil { klog.Errorf("Failed to cleanup OVN topology to version %d: %v", ovntypes.OvnCurrentTopologyVersion, err) return err @@ -451,6 +457,24 @@ func (oc *Controller) Run(ctx context.Context, wg *sync.WaitGroup) error { return nil } +func WithSyncDurationMetric(resourceName string, f func() error) error { + start := time.Now() + defer func() { + end := 
time.Since(start) + metrics.MetricMasterSyncDuration.WithLabelValues(resourceName).Set(end.Seconds()) + }() + return f() +} + +func WithSyncDurationMetricNoError(resourceName string, f func()) { + start := time.Now() + defer func() { + end := time.Since(start) + metrics.MetricMasterSyncDuration.WithLabelValues(resourceName).Set(end.Seconds()) + }() + f() +} + // syncPeriodic adds a goroutine that periodically does some work // right now there is only one ticker registered // for syncNodesPeriodic which deletes chassis records from the sbdb @@ -498,6 +522,17 @@ func (oc *Controller) ensurePod(oldPod, pod *kapi.Pod, addPort bool) error { if !util.PodScheduled(pod) { return nil } + if config.Metrics.EnableScaleMetrics { + start := time.Now() + defer func() { + duration := time.Since(start) + eventName := "add" + if !addPort { + eventName = "update" + } + metrics.RecordPodEvent(eventName, duration) + }() + } if oldPod != nil && (exGatewayAnnotationsChanged(oldPod, pod) || networkStatusAnnotationsChanged(oldPod, pod)) { // No matter if a pod is ovn networked, or host networked, we still need to check for exgw @@ -527,6 +562,13 @@ func (oc *Controller) ensurePod(oldPod, pod *kapi.Pod, addPort bool) error { // removePod tried to tear down a pod. It returns nil on success and error on failure; // failure indicates the pod tear down should be retried later. 
func (oc *Controller) removePod(pod *kapi.Pod, portInfo *lpInfo) error { + if config.Metrics.EnableScaleMetrics { + start := time.Now() + defer func() { + duration := time.Since(start) + metrics.RecordPodEvent("delete", duration) + }() + } if !util.PodWantsNetwork(pod) { if err := oc.deletePodExternalGW(pod); err != nil { return fmt.Errorf("unable to delete external gateway routes for pod %s: %w", diff --git a/go-controller/pkg/ovn/policy.go b/go-controller/pkg/ovn/policy.go index 3928e4753d..0cd6a93376 100644 --- a/go-controller/pkg/ovn/policy.go +++ b/go-controller/pkg/ovn/policy.go @@ -6,9 +6,11 @@ import ( "strconv" "strings" "sync" + "time" libovsdbclient "github.com/ovn-org/libovsdb/client" ovsdb "github.com/ovn-org/libovsdb/ovsdb" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/factory" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdbops" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/metrics" @@ -844,6 +846,13 @@ func (oc *Controller) denyPGDeletePorts(np *networkPolicy, portNamesToUUIDs map[ // handleLocalPodSelectorAddFunc adds a new pod to an existing NetworkPolicy, should be retriable. 
func (oc *Controller) handleLocalPodSelectorAddFunc(np *networkPolicy, objs ...interface{}) error { + if config.Metrics.EnableScaleMetrics { + start := time.Now() + defer func() { + duration := time.Since(start) + metrics.RecordNetpolLocalPodEvent("add", duration) + }() + } np.RLock() defer np.RUnlock() if np.deleted { @@ -888,6 +897,13 @@ func (oc *Controller) handleLocalPodSelectorAddFunc(np *networkPolicy, objs ...i // handleLocalPodSelectorDelFunc handles delete event for local pod, should be retriable func (oc *Controller) handleLocalPodSelectorDelFunc(np *networkPolicy, objs ...interface{}) error { + if config.Metrics.EnableScaleMetrics { + start := time.Now() + defer func() { + duration := time.Since(start) + metrics.RecordNetpolLocalPodEvent("delete", duration) + }() + } np.RLock() defer np.RUnlock() if np.deleted { @@ -1206,7 +1222,13 @@ func (oc *Controller) createNetworkPolicy(policy *knet.NetworkPolicy, aclLogging // if addNetworkPolicy fails, create or delete operation can be retried func (oc *Controller) addNetworkPolicy(policy *knet.NetworkPolicy) error { klog.Infof("Adding network policy %s", getPolicyKey(policy)) - + if config.Metrics.EnableScaleMetrics { + start := time.Now() + defer func() { + duration := time.Since(start) + metrics.RecordNetpolEvent("add", duration) + }() + } // To not hold nsLock for the whole process on network policy creation, we do the following: // 1. save required namespace information to use for netpol create // 2. 
create network policy without ns Lock @@ -1311,7 +1333,13 @@ func (oc *Controller) buildNetworkPolicyACLs(np *networkPolicy, aclLogging *ACLL func (oc *Controller) deleteNetworkPolicy(policy *knet.NetworkPolicy) error { npKey := getPolicyKey(policy) klog.Infof("Deleting network policy %s", npKey) - + if config.Metrics.EnableScaleMetrics { + start := time.Now() + defer func() { + duration := time.Since(start) + metrics.RecordNetpolEvent("delete", duration) + }() + } // First lock and update namespace nsInfo, nsUnlock := oc.getNamespaceLocked(policy.Namespace, false) if nsInfo != nil { @@ -1407,6 +1435,13 @@ func (oc *Controller) cleanupNetworkPolicy(np *networkPolicy) error { // selected as a peer by a NetworkPolicy's ingress/egress section to that // ingress/egress address set func (oc *Controller) handlePeerPodSelectorAddUpdate(np *networkPolicy, gp *gressPolicy, objs ...interface{}) error { + if config.Metrics.EnableScaleMetrics { + start := time.Now() + defer func() { + duration := time.Since(start) + metrics.RecordNetpolPeerPodEvent("add", duration) + }() + } np.RLock() defer np.RUnlock() if np.deleted { @@ -1429,6 +1464,13 @@ func (oc *Controller) handlePeerPodSelectorAddUpdate(np *networkPolicy, gp *gres // matches a NetworkPolicy ingress/egress section's selectors from that // ingress/egress address set func (oc *Controller) handlePeerPodSelectorDelete(np *networkPolicy, gp *gressPolicy, podSelector labels.Selector, obj interface{}) error { + if config.Metrics.EnableScaleMetrics { + start := time.Now() + defer func() { + duration := time.Since(start) + metrics.RecordNetpolPeerPodEvent("delete", duration) + }() + } np.RLock() defer np.RUnlock() if np.deleted { @@ -1565,6 +1607,13 @@ func (oc *Controller) addPeerPodHandler(podSelector *metav1.LabelSelector, func (oc *Controller) handlePeerNamespaceAndPodAdd(np *networkPolicy, gp *gressPolicy, podSelector labels.Selector, obj interface{}) error { + if config.Metrics.EnableScaleMetrics { + start := time.Now() + 
defer func() { + duration := time.Since(start) + metrics.RecordNetpolPeerNamespaceAndPodEvent("add", duration) + }() + } namespace := obj.(*kapi.Namespace) np.RLock() locked := true @@ -1614,6 +1663,13 @@ func (oc *Controller) handlePeerNamespaceAndPodAdd(np *networkPolicy, gp *gressP } func (oc *Controller) handlePeerNamespaceAndPodDel(np *networkPolicy, gp *gressPolicy, obj interface{}) error { + if config.Metrics.EnableScaleMetrics { + start := time.Now() + defer func() { + duration := time.Since(start) + metrics.RecordNetpolPeerNamespaceAndPodEvent("delete", duration) + }() + } np.RLock() defer np.RUnlock() if np.deleted { @@ -1679,6 +1735,13 @@ func (oc *Controller) addPeerNamespaceAndPodHandler(namespaceSelector *metav1.La } func (oc *Controller) handlePeerNamespaceSelectorAdd(np *networkPolicy, gp *gressPolicy, objs ...interface{}) error { + if config.Metrics.EnableScaleMetrics { + start := time.Now() + defer func() { + duration := time.Since(start) + metrics.RecordNetpolPeerNamespaceEvent("add", duration) + }() + } np.RLock() if np.deleted { np.RUnlock() @@ -1701,6 +1764,13 @@ func (oc *Controller) handlePeerNamespaceSelectorAdd(np *networkPolicy, gp *gres } func (oc *Controller) handlePeerNamespaceSelectorDel(np *networkPolicy, gp *gressPolicy, objs ...interface{}) error { + if config.Metrics.EnableScaleMetrics { + start := time.Now() + defer func() { + duration := time.Since(start) + metrics.RecordNetpolPeerNamespaceEvent("delete", duration) + }() + } np.RLock() if np.deleted { np.RUnlock()