-
Notifications
You must be signed in to change notification settings - Fork 1.6k
[BPF] Maglev Prometheus Metrics: Connection counts #11660
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0acf04c
93faa9b
13aa2a8
ddb1cf8
11546ee
bca2622
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -399,6 +399,8 @@ func describeBPFTests(opts ...bpfTestOpt) bool { | |
| options.ExtraEnvVars["FELIX_BPFLogLevel"] = fmt.Sprint(testOpts.bpfLogLevel) | ||
| options.ExtraEnvVars["FELIX_BPFConntrackLogLevel"] = fmt.Sprint(testOpts.bpfLogLevel) | ||
| options.ExtraEnvVars["FELIX_BPFProfiling"] = "Enabled" | ||
| options.ExtraEnvVars["FELIX_PrometheusMetricsEnabled"] = "true" | ||
| options.ExtraEnvVars["FELIX_PrometheusMetricsHost"] = "0.0.0.0" | ||
| if testOpts.dsr { | ||
| options.ExtraEnvVars["FELIX_BPFExternalServiceMode"] = "dsr" | ||
| } | ||
|
|
@@ -1906,6 +1908,10 @@ func describeBPFTests(opts ...bpfTestOpt) bool { | |
| familyInt = 6 | ||
| } | ||
|
|
||
| felixWithMaglevBackend := 0 | ||
| initialIngressFelix := 1 | ||
| failoverIngressFelix := 2 | ||
|
|
||
| newConntrackKey := func(srcIP net.IP, srcPort uint16, dstIP net.IP, dstPort uint16, family string) conntrack.KeyInterface { | ||
| var key conntrack.KeyInterface | ||
| // cmp := bytes.Compare(srcIP, dstIP) | ||
|
|
@@ -1982,6 +1988,19 @@ func describeBPFTests(opts ...bpfTestOpt) bool { | |
| } | ||
| } | ||
|
|
||
| probeMaglevConntrackMetric := func(metricName string, felixes ...*infrastructure.Felix) []int { | ||
| counts := make([]int, 0) | ||
| for _, f := range felixes { | ||
| ctCount, err := f.PromMetric(metricName).Int() | ||
| if err != nil { | ||
| log.WithError(err).WithField("felix", f.Name).Warn("Error while probing Felix metric. Skipping this felix") | ||
| continue | ||
| } | ||
| counts = append(counts, ctCount) | ||
| } | ||
| return counts | ||
| } | ||
|
|
||
| BeforeEach(func() { | ||
| switch testOpts.protocol { | ||
| case "udp": | ||
|
|
@@ -1995,11 +2014,24 @@ func describeBPFTests(opts ...bpfTestOpt) bool { | |
| } | ||
| log.WithFields(log.Fields{"number": proto, "name": testOpts.protocol}).Info("parsed protocol") | ||
|
|
||
| pTCP := numorstring.ProtocolFromString("tcp") | ||
| promPinhole := api.Rule{ | ||
| Action: "Allow", | ||
| Protocol: &pTCP, | ||
| Destination: api.EntityRule{ | ||
| Ports: []numorstring.Port{ | ||
| {MinPort: 9091, MaxPort: 9091}, | ||
| }, | ||
| Nets: []string{}, | ||
| }, | ||
| } | ||
|
|
||
| // Create policy allowing ingress from external client | ||
| allowIngressFromExtClient := api.NewGlobalNetworkPolicy() | ||
| allowIngressFromExtClient.Namespace = "fv" | ||
| allowIngressFromExtClient.Name = "policy-ext-client" | ||
| allowIngressFromExtClient.Spec.Ingress = []api.Rule{ | ||
| promPinhole, | ||
| { | ||
| Action: "Allow", | ||
| Source: api.EntityRule{ | ||
|
|
@@ -2015,11 +2047,12 @@ func describeBPFTests(opts ...bpfTestOpt) bool { | |
| allowIngressFromExtClient = createPolicy(allowIngressFromExtClient) | ||
|
|
||
| // Create service with maglev annotation | ||
| testSvc = k8sServiceWithExtIP(testSvcName, clusterIP, w[0][0], 80, tgtPort, 0, | ||
| testSvc = k8sServiceWithExtIP(testSvcName, clusterIP, w[felixWithMaglevBackend][0], 80, tgtPort, 0, | ||
| testOpts.protocol, []string{externalIP}) | ||
| testSvc.Annotations = map[string]string{ | ||
| "lb.projectcalico.org/external-traffic-strategy": "maglev", | ||
| } | ||
|
|
||
| testSvcNamespace = testSvc.Namespace | ||
| _, err := k8sClient.CoreV1().Services(testSvcNamespace).Create(context.Background(), testSvc, metav1.CreateOptions{}) | ||
| Expect(err).NotTo(HaveOccurred()) | ||
|
|
@@ -2071,16 +2104,39 @@ func describeBPFTests(opts ...bpfTestOpt) bool { | |
| cmdCleanRt = append(ipRoute, "route", "del", externalIP) | ||
| _ = externalClient.ExecMayFail(strings.Join(cmdCleanRt, "")) | ||
|
|
||
| cmdCIP := append(ipRoute, "route", "add", clusterIP, "via", felixIP(1)) | ||
| cmdCIP := append(ipRoute, "route", "add", clusterIP, "via", felixIP(initialIngressFelix)) | ||
| externalClient.Exec(cmdCIP...) | ||
| cmdEIP := append(ipRoute, "route", "add", externalIP, "via", felixIP(1)) | ||
| cmdEIP := append(ipRoute, "route", "add", externalIP, "via", felixIP(initialIngressFelix)) | ||
| externalClient.Exec(cmdEIP...) | ||
| }) | ||
|
|
||
| It("should have connectivity from external client to maglev backend via cluster IP and external IP", func() { | ||
| probeMaglevLocalConntrackMetricFunc := func(felixes ...*infrastructure.Felix) func() []int { | ||
| return func() []int { | ||
| return probeMaglevConntrackMetric(fmt.Sprintf("felix_bpf_conntrack_maglev_entries_total{destination=\"local\",ip_family=\"%d\"}", familyInt), felixes...) | ||
| } | ||
| } | ||
| probeMaglevRemoteConntrackMetricFunc := func(felixes ...*infrastructure.Felix) func() []int { | ||
| return func() []int { | ||
| return probeMaglevConntrackMetric(fmt.Sprintf("felix_bpf_conntrack_maglev_entries_total{destination=\"remote\",ip_family=\"%d\"}", familyInt), felixes...) | ||
| } | ||
| } | ||
|
|
||
| Eventually(probeMaglevLocalConntrackMetricFunc(tc.Felixes...), "10s", "1s").Should(Equal([]int{0, 0, 0}), "Expected maglev local-conntrack metric to start at 0 for all Felixes") | ||
| Eventually(probeMaglevRemoteConntrackMetricFunc(tc.Felixes...), "10s", "1s").Should(Equal([]int{0, 0, 0}), "Expected maglev remote-conntrack metric to start at 0 for all Felixes") | ||
|
|
||
| cc.ExpectSome(externalClient, TargetIP(clusterIP), port) | ||
| cc.ExpectSome(externalClient, TargetIP(externalIP), port) | ||
| cc.CheckConnectivity() | ||
|
|
||
| // There is a 10-second interval between iterations of Felix's conntrack scanner (where we export the maglev conntrack metrics). | ||
| // This means we must be very pessimistic about timeouts when searching for the prom values we're after. | ||
| Eventually(probeMaglevRemoteConntrackMetricFunc(tc.Felixes[initialIngressFelix]), "12s", "1s").Should(Equal([]int{2}), "Expected maglev-ingress felix to increment the remote-conntracks metric") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is 2 guaranteed to be correct? I guess, even if there was a retry, maglev would choose same backend and get same conntrack?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe two is guaranteed, yep. One for cluster IP, one for external IP. |
||
| Eventually(probeMaglevLocalConntrackMetricFunc(tc.Felixes[felixWithMaglevBackend]), "12s", "1s").Should(Equal([]int{2}), "Expected felix with maglev backend to increment the local-conntracks metric") | ||
| Consistently(probeMaglevLocalConntrackMetricFunc(tc.Felixes[initialIngressFelix])).Should(Equal([]int{0}), "Expected ingress-felix to only have remote maglev conntracks, but saw metric for local maglev conntracks go up") | ||
| Consistently(probeMaglevRemoteConntrackMetricFunc(tc.Felixes[felixWithMaglevBackend])).Should(Equal([]int{0}), "Expected backing felix to only have local maglev conntracks, but saw metric for remote maglev conntracks go up") | ||
| Consistently(probeMaglevLocalConntrackMetricFunc(tc.Felixes[failoverIngressFelix])).Should(Equal([]int{0}), "No failover occurred, but an unrelated Felix's local maglev prom metrics went up") | ||
| Consistently(probeMaglevRemoteConntrackMetricFunc(tc.Felixes[failoverIngressFelix])).Should(Equal([]int{0}), "No failover occurred, but an unrelated Felix's remote maglev prom metrics went up") | ||
| }) | ||
|
|
||
| testFailover := func(serviceIP string) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this should move to
case ScanVerdictOKas you probably only care about active connections. This would also count connections that are expired and are going to be removed or do not have backed anymore etc.Maybe you want to do it in the cleaner since you may not want to count connections in some FIN-wait state, where they may be for quite a while, but for the purpose of "can I remove this node" they do not matter anymore.
When @sridhartigera makes the change to send RSTs when the backend is no more, this would perhaps need to move again 🤔
Obviously, the stats will be fixed up follow up iterations when connection is deleted.