From 333d91371a1835f499073208dbb712467921aea5 Mon Sep 17 00:00:00 2001 From: Jamo Luhrsen Date: Mon, 17 Jan 2022 14:55:44 -0800 Subject: [PATCH] Flake failed sandboxes from bug in new guard pods more info in https://bugzilla.redhat.com/show_bug.cgi?id=2038481 essentially, two new guard pods are being started in 4.10+ and are incorrectly being restarted on a cordoned node and when the node is rebooted those pods fail to set up sandboxes right away before the underlying network config file is present. This can be reverted when the PR to fix this is merged: https://github.com/openshift/library-go/pull/1287 Signed-off-by: Jamo Luhrsen --- pkg/synthetictests/networking.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pkg/synthetictests/networking.go b/pkg/synthetictests/networking.go index 01d07580f8ca..19521aea0d1f 100644 --- a/pkg/synthetictests/networking.go +++ b/pkg/synthetictests/networking.go @@ -70,12 +70,16 @@ func testPodSandboxCreation(events monitorapi.Intervals) []*junitapi.JUnitTestCa } } else { timeBetweenDeleteAndFailure := event.From.Sub(*deletionTime) + nodeRebootTime := getNodeRebootTime(events, monitorapi.LocatorParts(event.Locator)["node"]) switch { case timeBetweenDeleteAndFailure < 1*time.Second: // nothing here, one second is close enough to be ok, the kubelet and CNI just didn't know case timeBetweenDeleteAndFailure < 5*time.Second: // withing five seconds, it ought to be long enough to know, but it's close enough to flake and not fail flakes = append(flakes, fmt.Sprintf("%v - %0.2f seconds after deletion - %v", event.Locator, timeBetweenDeleteAndFailure.Seconds(), event.Message)) + case nodeRebootTime != nil && nodeRebootTime.After(*deletionTime) && strings.Contains(event.Locator, "guard"): + flakes = append(flakes, fmt.Sprintf("the deletion time %v came before a reboot at %v and error is coming "+ "from a guard pod. see https://bugzilla.redhat.com/show_bug.cgi?id=2038481", deletionTime, nodeRebootTime)) case deletionTime.Before(event.From): // something went wrong. More than five seconds after the pod ws deleted, the CNI is trying to set up pod sandboxes and can't failures = append(failures, fmt.Sprintf("%v - %0.2f seconds after deletion - %v", event.Locator, timeBetweenDeleteAndFailure.Seconds(), event.Message)) @@ -184,3 +188,14 @@ } return nil } + +func getNodeRebootTime(events monitorapi.Intervals, node string) *time.Time { + + for _, event := range events { + eventNodeName, _ := monitorapi.NodeFromLocator(event.Locator) + if eventNodeName == node && strings.Contains(event.Message, "reason/Rebooted") { + return &event.From + } + } + return nil +}