From 333d91371a1835f499073208dbb712467921aea5 Mon Sep 17 00:00:00 2001 From: Jamo Luhrsen Date: Mon, 17 Jan 2022 14:55:44 -0800 Subject: [PATCH] Flake failed sandboxes from bug in new guard pods more info in https://bugzilla.redhat.com/show_bug.cgi?id=2038481 essentially, two new guard pods are being started in 4.10+ and are incorrectly being restarted on a cordoned node and when the node is rebooted those pods fail to set up sandboxes right away before the underlying network config file is present. This can be reverted when the PR to fix this is merged: https://github.com/openshift/library-go/pull/1287 Signed-off-by: Jamo Luhrsen --- pkg/synthetictests/networking.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pkg/synthetictests/networking.go b/pkg/synthetictests/networking.go index 01d07580f8ca..19521aea0d1f 100644 --- a/pkg/synthetictests/networking.go +++ b/pkg/synthetictests/networking.go @@ -70,12 +70,16 @@ func testPodSandboxCreation(events monitorapi.Intervals) []*junitapi.JUnitTestCa } } else { timeBetweenDeleteAndFailure := event.From.Sub(*deletionTime) + nodeRebootTime := getNodeRebootTime(events, monitorapi.LocatorParts(event.Locator)["node"]) switch { case timeBetweenDeleteAndFailure < 1*time.Second: // nothing here, one second is close enough to be ok, the kubelet and CNI just didn't know case timeBetweenDeleteAndFailure < 5*time.Second: // withing five seconds, it ought to be long enough to know, but it's close enough to flake and not fail flakes = append(flakes, fmt.Sprintf("%v - %0.2f seconds after deletion - %v", event.Locator, timeBetweenDeleteAndFailure.Seconds(), event.Message)) + case nodeRebootTime != nil && nodeRebootTime.After(*deletionTime) && strings.Contains(event.Locator, "guard"): + flakes = append(flakes, fmt.Sprintf("the deletion time %v came before a reboot at %v and error is coming "+ "from a guard pod. see https://bugzilla.redhat.com/show_bug.cgi?id=2038481", deletionTime, nodeRebootTime)) case deletionTime.Before(event.From): // something went wrong. More than five seconds after the pod ws deleted, the CNI is trying to set up pod sandboxes and can't failures = append(failures, fmt.Sprintf("%v - %0.2f seconds after deletion - %v", event.Locator, timeBetweenDeleteAndFailure.Seconds(), event.Message)) @@ -184,3 +188,14 @@ } return nil } + +func getNodeRebootTime(events monitorapi.Intervals, node string) *time.Time { + + for _, event := range events { + eventNodeName, _ := monitorapi.NodeFromLocator(event.Locator) + if eventNodeName == node && strings.Contains(event.Message, "reason/Rebooted") { + return &event.From + } + } + return nil +}