diff --git a/test/e2e/upgrade/alert/alert.go b/test/e2e/upgrade/alert/alert.go
index 800feb9505ce..3b277fdae692 100644
--- a/test/e2e/upgrade/alert/alert.go
+++ b/test/e2e/upgrade/alert/alert.go
@@ -21,9 +21,6 @@ const (
 	// Delay after upgrade is complete before checking for critical alerts
 	alertCheckSleepMinutes = 5
 	alertCheckSleep        = alertCheckSleepMinutes * time.Minute
-
-	// Previous period in which to check for critical alerts
-	alertPeriodCheckMinutes = 1
 )
 
 // UpgradeTest runs post-upgrade after alertCheckSleep delay and tests if any critical alerts are firing.
@@ -84,8 +81,11 @@ func (t *UpgradeTest) Test(f *framework.Framework, done <-chan struct{}, upgrade
 	// period by verifying Watchdog alert has been in firing state
 	watchdogQuery := fmt.Sprintf(`count_over_time(ALERTS{alertstate="firing",alertname="Watchdog", severity="none"}[%dm])`, alertCheckSleepMinutes)
 
+	// we only consider series sent since the beginning of the test
+	testDuration := exutil.DurationSinceStartInSeconds().String()
+
 	// Query to check for any critical severity alerts that have occurred within the last alertPeriodCheckMinutes.
-	criticalAlertQuery := fmt.Sprintf(`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh",alertstate="firing",severity="critical"}[%dm]) >= 1`, alertPeriodCheckMinutes)
+	criticalAlertQuery := fmt.Sprintf(`count_over_time(ALERTS{alertstate="firing",severity="critical"}[%s]) >= 1`, testDuration)
 
 	tests := map[string]bool{
 		watchdogQuery: true,
diff --git a/test/extended/prometheus/prometheus.go b/test/extended/prometheus/prometheus.go
index 2d6bda4966d7..b65b2fd7c74a 100644
--- a/test/extended/prometheus/prometheus.go
+++ b/test/extended/prometheus/prometheus.go
@@ -68,7 +68,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
 			// Checking Watchdog alert state is done in "should have a Watchdog alert in firing state".
 			// TODO: remove KubePodCrashLooping subtraction logic once https://bugzilla.redhat.com/show_bug.cgi?id=1842002
 			// is fixed, but for now we are ignoring KubePodCrashLooping alerts in the openshift-kube-controller-manager namespace.
-			fmt.Sprintf(`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh",alertstate="firing",severity!="info"}[%[1]s]) - count_over_time(ALERTS{alertname="KubePodCrashLooping",namespace="openshift-kube-controller-manager",alertstate="firing",severity!="info"}[%[1]s]) >= 1`, testDuration): false,
+			fmt.Sprintf(`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured",alertstate="firing",severity!="info"}[%[1]s]) - count_over_time(ALERTS{alertname="KubePodCrashLooping",namespace="openshift-kube-controller-manager",alertstate="firing",severity!="info"}[%[1]s]) >= 1`, testDuration): false,
 		}
 		err := helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 		o.Expect(err).NotTo(o.HaveOccurred())
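
For reference, the sketch below shows what the rewritten critical-alert query from alert.go expands to once a test duration is substituted in. It is a minimal standalone illustration, not code from the repository: the hard-coded 42-minute duration and the main wrapper are assumptions, while the PromQL template matches the one added in the diff above; in the real test the range comes from exutil.DurationSinceStartInSeconds().String().

package main

import (
	"fmt"
	"time"
)

func main() {
	// Assume the e2e run has been going for 42 minutes and 17 seconds. In the real test
	// this comes from exutil.DurationSinceStartInSeconds().String(), which yields a
	// unit-suffixed duration such as "42m17s" -- a valid PromQL range duration.
	testDuration := (42*time.Minute + 17*time.Second).String()

	// Critical-alert check from test/e2e/upgrade/alert/alert.go: any critical alert seen
	// firing at any point since the test started produces a series, which fails the test.
	criticalAlertQuery := fmt.Sprintf(
		`count_over_time(ALERTS{alertstate="firing",severity="critical"}[%s]) >= 1`,
		testDuration)

	fmt.Println(criticalAlertQuery)
	// Output:
	// count_over_time(ALERTS{alertstate="firing",severity="critical"}[42m17s]) >= 1
}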