From f2f116cb6febd6513263a5d92c8262485cc97848 Mon Sep 17 00:00:00 2001
From: "W. Trevor King"
Date: Mon, 22 Feb 2021 11:07:56 -0800
Subject: [PATCH 1/3] test/e2e/upgrade/alert: Extend to testDuration, not just 1m

We're doing better in updates now, and want to ratchet down our
tolerance to bar critical-alert noise during updates.

The old 1m alertPeriodCheckMinutes landed with this test in 3b8cb3ca9b
(Add CI test to check for crit alerts post upgrade, 2020-03-27,
#24786).  DurationSinceStartInSeconds, which I'm using now, landed in
ace1345c00 (test: Allow tests that check invariants over time to be
constrained, 2021-01-06, #25784).
---
 test/e2e/upgrade/alert/alert.go | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/e2e/upgrade/alert/alert.go b/test/e2e/upgrade/alert/alert.go
index 800feb9505ce..1f1f5bae168a 100644
--- a/test/e2e/upgrade/alert/alert.go
+++ b/test/e2e/upgrade/alert/alert.go
@@ -21,9 +21,6 @@ const (
 	// Delay after upgrade is complete before checking for critical alerts
 	alertCheckSleepMinutes = 5
 	alertCheckSleep        = alertCheckSleepMinutes * time.Minute
-
-	// Previous period in which to check for critical alerts
-	alertPeriodCheckMinutes = 1
 )
 
 // UpgradeTest runs post-upgrade after alertCheckSleep delay and tests if any critical alerts are firing.
@@ -84,8 +81,11 @@ func (t *UpgradeTest) Test(f *framework.Framework, done <-chan struct{}, upgrade
 	// period by verifying Watchdog alert has been in firing state
 	watchdogQuery := fmt.Sprintf(`count_over_time(ALERTS{alertstate="firing",alertname="Watchdog", severity="none"}[%dm])`, alertCheckSleepMinutes)
 
+	// we only consider series sent since the beginning of the test
+	testDuration := exutil.DurationSinceStartInSeconds().String()
+
 	// Query to check for any critical severity alerts that have occurred within the last alertPeriodCheckMinutes.
-	criticalAlertQuery := fmt.Sprintf(`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh",alertstate="firing",severity="critical"}[%dm]) >= 1`, alertPeriodCheckMinutes)
+	criticalAlertQuery := fmt.Sprintf(`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh",alertstate="firing",severity="critical"}[%s]) >= 1`, testDuration)
 
 	tests := map[string]bool{
 		watchdogQuery: true,
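The patch above swaps the fixed one-minute lookback for a range selector that
covers everything since the e2e suite started.  As a rough, self-contained
sketch of that technique (not the origin code: testStart and
durationSinceStartInSeconds below are stand-ins for whatever
exutil.DurationSinceStartInSeconds tracks internally), the window can be
derived from the recorded start time and spliced into the PromQL range
selector:

package main

import (
	"fmt"
	"time"
)

// testStart stands in for the moment the e2e suite began; the real framework
// records this once at startup.
var testStart = time.Now().Add(-42 * time.Minute)

// durationSinceStartInSeconds mirrors the idea behind
// exutil.DurationSinceStartInSeconds: elapsed time since the suite started,
// rounded to whole seconds.
func durationSinceStartInSeconds() time.Duration {
	return time.Since(testStart).Round(time.Second)
}

func main() {
	// String() yields something like "42m0s", which slots into a PromQL
	// range selector.
	testDuration := durationSinceStartInSeconds().String()

	criticalAlertQuery := fmt.Sprintf(
		`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh",alertstate="firing",severity="critical"}[%s]) >= 1`,
		testDuration)
	fmt.Println(criticalAlertQuery)
}

Rounding to whole seconds keeps time.Duration's String output (for example
"42m0s") in a compound-duration form that recent Prometheus releases accept in
range selectors.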
From 0567445bdf99a6c0054e859f38dc0d758ca63c0f Mon Sep 17 00:00:00 2001
From: "W. Trevor King"
Date: Mon, 22 Feb 2021 11:12:43 -0800
Subject: [PATCH 2/3] test/e2e/upgrade/alert: Drop alertname filter

We don't want any critical alerts firing.  Watchdog is severity=none
[1].  AlertmanagerReceiversNotConfigured is severity=warning [2].
KubeAPILatencyHigh was dropped in
openshift/cluster-monitoring-operator#898, 2020-08-03, and was
severity=warning anyway.

[1]: https://github.com/openshift/cluster-monitoring-operator/blob/776379a9616be1cbea49dd86086d8be8230370ce/assets/prometheus-k8s/rules.yaml#L2326-L2336
[2]: https://github.com/openshift/cluster-monitoring-operator/blob/776379a9616be1cbea49dd86086d8be8230370ce/assets/prometheus-k8s/rules.yaml#L924-L933
---
 test/e2e/upgrade/alert/alert.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/e2e/upgrade/alert/alert.go b/test/e2e/upgrade/alert/alert.go
index 1f1f5bae168a..3b277fdae692 100644
--- a/test/e2e/upgrade/alert/alert.go
+++ b/test/e2e/upgrade/alert/alert.go
@@ -85,7 +85,7 @@ func (t *UpgradeTest) Test(f *framework.Framework, done <-chan struct{}, upgrade
 	testDuration := exutil.DurationSinceStartInSeconds().String()
 
 	// Query to check for any critical severity alerts that have occurred within the last alertPeriodCheckMinutes.
-	criticalAlertQuery := fmt.Sprintf(`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh",alertstate="firing",severity="critical"}[%s]) >= 1`, testDuration)
+	criticalAlertQuery := fmt.Sprintf(`count_over_time(ALERTS{alertstate="firing",severity="critical"}[%s]) >= 1`, testDuration)
 
 	tests := map[string]bool{
 		watchdogQuery: true,

From 52495842a1fbd8be7afb053a65d5df7773d5d617 Mon Sep 17 00:00:00 2001
From: "W. Trevor King"
Date: Mon, 22 Feb 2021 11:17:31 -0800
Subject: [PATCH 3/3] test/extended/prometheus: Drop KubeAPILatencyHigh exclusion

KubeAPILatencyHigh was dropped in
openshift/cluster-monitoring-operator#898 (2020-08-03, 4.6) [1].

[1]: https://bugzilla.redhat.com/show_bug.cgi?id=1846805#c13
---
 test/extended/prometheus/prometheus.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/extended/prometheus/prometheus.go b/test/extended/prometheus/prometheus.go
index 2d6bda4966d7..b65b2fd7c74a 100644
--- a/test/extended/prometheus/prometheus.go
+++ b/test/extended/prometheus/prometheus.go
@@ -68,7 +68,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
 		// Checking Watchdog alert state is done in "should have a Watchdog alert in firing state".
 		// TODO: remove KubePodCrashLooping subtraction logic once https://bugzilla.redhat.com/show_bug.cgi?id=1842002
 		// is fixed, but for now we are ignoring KubePodCrashLooping alerts in the openshift-kube-controller-manager namespace.
-		fmt.Sprintf(`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh",alertstate="firing",severity!="info"}[%[1]s]) - count_over_time(ALERTS{alertname="KubePodCrashLooping",namespace="openshift-kube-controller-manager",alertstate="firing",severity!="info"}[%[1]s]) >= 1`, testDuration): false,
+		fmt.Sprintf(`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured",alertstate="firing",severity!="info"}[%[1]s]) - count_over_time(ALERTS{alertname="KubePodCrashLooping",namespace="openshift-kube-controller-manager",alertstate="firing",severity!="info"}[%[1]s]) >= 1`, testDuration): false,
 	}
 	err := helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 	o.Expect(err).NotTo(o.HaveOccurred())
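Across all three patches the shape of the check stays the same: a map from
PromQL expression to a boolean recording whether the test expects the query to
return any series (the Watchdog query should, the critical-alert query should
not), handed to helper.RunQueries against the in-cluster Prometheus.  The
sketch below only approximates that pattern using the public Prometheus HTTP
API; runQueries, promResult, and the localhost URL are illustrative stand-ins,
not the origin helpers.

package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
)

// promResult models the subset of the Prometheus /api/v1/query response we
// care about: whether any series matched.
type promResult struct {
	Data struct {
		Result []json.RawMessage `json:"result"`
	} `json:"data"`
}

// runQueries is a hand-rolled stand-in for the helper.RunQueries call in the
// diffs: each key is a PromQL expression, and the value says whether we
// expect the query to return at least one series.
func runQueries(promURL string, tests map[string]bool) error {
	for query, expectSeries := range tests {
		resp, err := http.Get(promURL + "/api/v1/query?query=" + url.QueryEscape(query))
		if err != nil {
			return err
		}
		var result promResult
		err = json.NewDecoder(resp.Body).Decode(&result)
		resp.Body.Close()
		if err != nil {
			return err
		}
		if got := len(result.Data.Result) > 0; got != expectSeries {
			return fmt.Errorf("query %q: expected series=%t, got series=%t", query, expectSeries, got)
		}
	}
	return nil
}

func main() {
	tests := map[string]bool{
		`count_over_time(ALERTS{alertstate="firing",alertname="Watchdog",severity="none"}[5m])`: true,  // Watchdog should be firing
		`count_over_time(ALERTS{alertstate="firing",severity="critical"}[1h]) >= 1`:             false, // no critical alerts expected
	}
	if err := runQueries("http://localhost:9090", tests); err != nil {
		fmt.Println("alert check failed:", err)
	}
}

Because comparison operators in PromQL filter rather than return booleans,
count_over_time(...) >= 1 yields an empty result vector when no matching alert
fired in the window, which is exactly the "no series expected" case that a
false value in the map encodes.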