From f2f116cb6febd6513263a5d92c8262485cc97848 Mon Sep 17 00:00:00 2001
From: "W. Trevor King"
Date: Mon, 22 Feb 2021 11:07:56 -0800
Subject: [PATCH 1/3] test/e2e/upgrade/alert: Extend to testDuration, not just 1m

We're doing better in updates now, and want to ratchet down our
tolerance to bar critical-alert noise during updates.

The old 1m alertPeriodCheckMinutes landed with this test in 3b8cb3ca9b
(Add CI test to check for crit alerts post upgrade, 2020-03-27,
#24786).  DurationSinceStartInSeconds, which I'm using now, landed in
ace1345c00 (test: Allow tests that check invariants over time to be
constrained, 2021-01-06, #25784).
---
 test/e2e/upgrade/alert/alert.go | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/e2e/upgrade/alert/alert.go b/test/e2e/upgrade/alert/alert.go
index 800feb9505ce..1f1f5bae168a 100644
--- a/test/e2e/upgrade/alert/alert.go
+++ b/test/e2e/upgrade/alert/alert.go
@@ -21,9 +21,6 @@ const (
 	// Delay after upgrade is complete before checking for critical alerts
 	alertCheckSleepMinutes = 5
 	alertCheckSleep        = alertCheckSleepMinutes * time.Minute
-
-	// Previous period in which to check for critical alerts
-	alertPeriodCheckMinutes = 1
 )
 
 // UpgradeTest runs post-upgrade after alertCheckSleep delay and tests if any critical alerts are firing.
@@ -84,8 +81,11 @@ func (t *UpgradeTest) Test(f *framework.Framework, done <-chan struct{}, upgrade
 	// period by verifying Watchdog alert has been in firing state
 	watchdogQuery := fmt.Sprintf(`count_over_time(ALERTS{alertstate="firing",alertname="Watchdog", severity="none"}[%dm])`, alertCheckSleepMinutes)
 
+	// we only consider series sent since the beginning of the test
+	testDuration := exutil.DurationSinceStartInSeconds().String()
+
 	// Query to check for any critical severity alerts that have occurred within the last alertPeriodCheckMinutes.
-	criticalAlertQuery := fmt.Sprintf(`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh",alertstate="firing",severity="critical"}[%dm]) >= 1`, alertPeriodCheckMinutes)
+	criticalAlertQuery := fmt.Sprintf(`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh",alertstate="firing",severity="critical"}[%s]) >= 1`, testDuration)
 
 	tests := map[string]bool{
 		watchdogQuery: true,
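The patch above swaps the fixed one-minute lookback for a range selector that
covers everything since the e2e suite started.  As a rough, self-contained
sketch of that technique (not the origin code: testStart and
durationSinceStartInSeconds below are stand-ins for whatever
exutil.DurationSinceStartInSeconds tracks internally), the window can be
derived from the recorded start time and spliced into the PromQL range
selector:

package main

import (
	"fmt"
	"time"
)

// testStart stands in for the moment the e2e suite began; the real framework
// records this once at startup.
var testStart = time.Now().Add(-42 * time.Minute)

// durationSinceStartInSeconds mirrors the idea behind
// exutil.DurationSinceStartInSeconds: elapsed time since the suite started,
// rounded to whole seconds.
func durationSinceStartInSeconds() time.Duration {
	return time.Since(testStart).Round(time.Second)
}

func main() {
	// String() yields something like "42m0s", which slots into a PromQL
	// range selector.
	testDuration := durationSinceStartInSeconds().String()

	criticalAlertQuery := fmt.Sprintf(
		`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh",alertstate="firing",severity="critical"}[%s]) >= 1`,
		testDuration)
	fmt.Println(criticalAlertQuery)
}

Rounding to whole seconds keeps time.Duration's String output (for example
"42m0s") in a compound-duration form that recent Prometheus releases accept in
range selectors.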
From 0567445bdf99a6c0054e859f38dc0d758ca63c0f Mon Sep 17 00:00:00 2001
From: "W. Trevor King"
Date: Mon, 22 Feb 2021 11:12:43 -0800
Subject: [PATCH 2/3] test/e2e/upgrade/alert: Drop alertname filter

We don't want any critical alerts firing.  Watchdog is severity=none
[1].  AlertmanagerReceiversNotConfigured is severity=warning [2].
KubeAPILatencyHigh was dropped in
openshift/cluster-monitoring-operator#898, 2020-08-03, and was
severity=warning anyway.

[1]: https://github.com/openshift/cluster-monitoring-operator/blob/776379a9616be1cbea49dd86086d8be8230370ce/assets/prometheus-k8s/rules.yaml#L2326-L2336
[2]: https://github.com/openshift/cluster-monitoring-operator/blob/776379a9616be1cbea49dd86086d8be8230370ce/assets/prometheus-k8s/rules.yaml#L924-L933
---
 test/e2e/upgrade/alert/alert.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/e2e/upgrade/alert/alert.go b/test/e2e/upgrade/alert/alert.go
index 1f1f5bae168a..3b277fdae692 100644
--- a/test/e2e/upgrade/alert/alert.go
+++ b/test/e2e/upgrade/alert/alert.go
@@ -85,7 +85,7 @@ func (t *UpgradeTest) Test(f *framework.Framework, done <-chan struct{}, upgrade
 	testDuration := exutil.DurationSinceStartInSeconds().String()
 
 	// Query to check for any critical severity alerts that have occurred within the last alertPeriodCheckMinutes.
-	criticalAlertQuery := fmt.Sprintf(`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh",alertstate="firing",severity="critical"}[%s]) >= 1`, testDuration)
+	criticalAlertQuery := fmt.Sprintf(`count_over_time(ALERTS{alertstate="firing",severity="critical"}[%s]) >= 1`, testDuration)
 
 	tests := map[string]bool{
 		watchdogQuery: true,

From 52495842a1fbd8be7afb053a65d5df7773d5d617 Mon Sep 17 00:00:00 2001
From: "W. Trevor King"
Date: Mon, 22 Feb 2021 11:17:31 -0800
Subject: [PATCH 3/3] test/extended/prometheus: Drop KubeAPILatencyHigh exclusion

KubeAPILatencyHigh was dropped in
openshift/cluster-monitoring-operator#898 (2020-08-03, 4.6) [1].

[1]: https://bugzilla.redhat.com/show_bug.cgi?id=1846805#c13
---
 test/extended/prometheus/prometheus.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/extended/prometheus/prometheus.go b/test/extended/prometheus/prometheus.go
index 2d6bda4966d7..b65b2fd7c74a 100644
--- a/test/extended/prometheus/prometheus.go
+++ b/test/extended/prometheus/prometheus.go
@@ -68,7 +68,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
 		// Checking Watchdog alert state is done in "should have a Watchdog alert in firing state".
 		// TODO: remove KubePodCrashLooping subtraction logic once https://bugzilla.redhat.com/show_bug.cgi?id=1842002
 		// is fixed, but for now we are ignoring KubePodCrashLooping alerts in the openshift-kube-controller-manager namespace.
-		fmt.Sprintf(`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh",alertstate="firing",severity!="info"}[%[1]s]) - count_over_time(ALERTS{alertname="KubePodCrashLooping",namespace="openshift-kube-controller-manager",alertstate="firing",severity!="info"}[%[1]s]) >= 1`, testDuration): false,
+		fmt.Sprintf(`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured",alertstate="firing",severity!="info"}[%[1]s]) - count_over_time(ALERTS{alertname="KubePodCrashLooping",namespace="openshift-kube-controller-manager",alertstate="firing",severity!="info"}[%[1]s]) >= 1`, testDuration): false,
 	}
 	err := helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 	o.Expect(err).NotTo(o.HaveOccurred())
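Across all three patches the shape of the check stays the same: a map from
PromQL expression to a boolean recording whether the test expects the query to
return any series (the Watchdog query should, the critical-alert query should
not), handed to helper.RunQueries against the in-cluster Prometheus.  The
sketch below only approximates that pattern using the public Prometheus HTTP
API; runQueries, promResult, and the localhost URL are illustrative stand-ins,
not the origin helpers.

package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
)

// promResult models the subset of the Prometheus /api/v1/query response we
// care about: whether any series matched.
type promResult struct {
	Data struct {
		Result []json.RawMessage `json:"result"`
	} `json:"data"`
}

// runQueries is a hand-rolled stand-in for the helper.RunQueries call in the
// diffs: each key is a PromQL expression, and the value says whether we
// expect the query to return at least one series.
func runQueries(promURL string, tests map[string]bool) error {
	for query, expectSeries := range tests {
		resp, err := http.Get(promURL + "/api/v1/query?query=" + url.QueryEscape(query))
		if err != nil {
			return err
		}
		var result promResult
		err = json.NewDecoder(resp.Body).Decode(&result)
		resp.Body.Close()
		if err != nil {
			return err
		}
		if got := len(result.Data.Result) > 0; got != expectSeries {
			return fmt.Errorf("query %q: expected series=%t, got series=%t", query, expectSeries, got)
		}
	}
	return nil
}

func main() {
	tests := map[string]bool{
		`count_over_time(ALERTS{alertstate="firing",alertname="Watchdog",severity="none"}[5m])`: true,  // Watchdog should be firing
		`count_over_time(ALERTS{alertstate="firing",severity="critical"}[1h]) >= 1`:             false, // no critical alerts expected
	}
	if err := runQueries("http://localhost:9090", tests); err != nil {
		fmt.Println("alert check failed:", err)
	}
}

Because comparison operators in PromQL filter rather than return booleans,
count_over_time(...) >= 1 yields an empty result vector when no matching alert
fired in the window, which is exactly the "no series expected" case that a
false value in the map encodes.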