From 4bc8115cb4b78b78b97414118b33232fb9400e75 Mon Sep 17 00:00:00 2001
From: Clayton Coleman
Date: Tue, 23 Feb 2021 20:03:37 -0500
Subject: [PATCH 1/2] test: Prometheus helper should use thanos-querier

This ensures that default queries aggregate results from the running
instances (during upgrades) and helps us test invariants like "there is
always an alert firing even during upgrades".

Note that this makes reconstructing a test more complex: we can no longer
exactly reproduce a query by downloading the Prometheus data from a job.
That was already true, though, because queries were round-robined to one
of the instances and some data is implicitly missing.
---
 test/e2e/upgrade/alert/alert.go               |  3 ++-
 test/extended/machines/cluster.go             |  2 +-
 test/extended/prometheus/prometheus.go        | 22 ++++++++++------
 test/extended/prometheus/prometheus_builds.go |  2 +-
 test/extended/prometheus/storage.go           |  2 +-
 test/extended/util/prometheus/helpers.go      | 25 ++++++-------------
 6 files changed, 27 insertions(+), 29 deletions(-)

diff --git a/test/e2e/upgrade/alert/alert.go b/test/e2e/upgrade/alert/alert.go
index 800feb9505ce..4f205d6696f3 100644
--- a/test/e2e/upgrade/alert/alert.go
+++ b/test/e2e/upgrade/alert/alert.go
@@ -42,7 +42,8 @@ func (UpgradeTest) DisplayName() string {
 func (t *UpgradeTest) Setup(f *framework.Framework) {
 	g.By("Setting up post-upgrade alert test")
 
-	url, bearerToken, oc, ok := helper.ExpectPrometheus(f)
+	oc := exutil.NewCLIWithFramework(f)
+	url, _, bearerToken, ok := helper.LocatePrometheus(oc)
 	if !ok {
 		framework.Failf("Prometheus could not be located on this cluster, failing test %s", t.Name())
 	}
diff --git a/test/extended/machines/cluster.go b/test/extended/machines/cluster.go
index 4dd78b6577c8..5e6cd1866e41 100644
--- a/test/extended/machines/cluster.go
+++ b/test/extended/machines/cluster.go
@@ -66,7 +66,7 @@ var _ = g.Describe("[sig-node] Managed cluster", func() {
 	)
 	g.BeforeEach(func() {
 		var ok bool
-		url, bearerToken, ok = prometheus.LocatePrometheus(oc)
+		url, _, bearerToken, ok = prometheus.LocatePrometheus(oc)
 		if !ok {
 			e2e.Failf("Prometheus could not be located on this cluster, failing prometheus test")
 		}
diff --git a/test/extended/prometheus/prometheus.go b/test/extended/prometheus/prometheus.go
index 00836c8b0170..9a6acbb0b917 100644
--- a/test/extended/prometheus/prometheus.go
+++ b/test/extended/prometheus/prometheus.go
@@ -45,7 +45,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
 	)
 	g.BeforeEach(func() {
 		var ok bool
-		url, bearerToken, ok = helper.LocatePrometheus(oc)
+		url, _, bearerToken, ok = helper.LocatePrometheus(oc)
 		if !ok {
 			e2e.Failf("Prometheus could not be located on this cluster, failing prometheus test")
 		}
@@ -133,12 +133,12 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 	var (
 		oc = exutil.NewCLIWithoutNamespace("prometheus")
 
-		url, bearerToken string
+		url, prometheusURL, bearerToken string
 	)
 
 	g.BeforeEach(func() {
 		var ok bool
-		url, bearerToken, ok = helper.LocatePrometheus(oc)
+		url, prometheusURL, bearerToken, ok = helper.LocatePrometheus(oc)
 		if !ok {
 			e2e.Failf("Prometheus could not be located on this cluster, failing prometheus test")
 		}
@@ -183,7 +183,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			g.By("checking the prometheus metrics path")
 			var metrics map[string]*dto.MetricFamily
 			o.Expect(wait.PollImmediate(10*time.Second, 2*time.Minute, func() (bool, error) {
-				results, err := getInsecureURLViaPod(ns, execPod.Name, fmt.Sprintf("%s/metrics", url))
+				results, err := getInsecureURLViaPod(ns, execPod.Name, fmt.Sprintf("%s/metrics", prometheusURL))
 				if err != nil {
 					e2e.Logf("unable to get metrics: %v", err)
 					return false, nil
@@ -222,7 +222,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			// expect all endpoints within 60 seconds
 			var lastErrs []error
 			o.Expect(wait.PollImmediate(10*time.Second, 2*time.Minute, func() (bool, error) {
-				contents, err := getBearerTokenURLViaPod(ns, execPod.Name, fmt.Sprintf("%s/api/v1/targets", url), bearerToken)
+				contents, err := getBearerTokenURLViaPod(ns, execPod.Name, fmt.Sprintf("%s/api/v1/targets", prometheusURL), bearerToken)
 				o.Expect(err).NotTo(o.HaveOccurred())
 
 				targets := &prometheusTargets{}
@@ -269,7 +269,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			g.By("verifying all targets are exposing metrics over secure channel")
 			var insecureTargets []error
 
-			contents, err := getBearerTokenURLViaPod(ns, execPod.Name, fmt.Sprintf("%s/api/v1/targets", url), bearerToken)
+			contents, err := getBearerTokenURLViaPod(ns, execPod.Name, fmt.Sprintf("%s/api/v1/targets", prometheusURL), bearerToken)
 			o.Expect(err).NotTo(o.HaveOccurred())
 
 			targets := &prometheusTargets{}
@@ -299,6 +299,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			}
 			o.Expect(insecureTargets).To(o.BeEmpty(), "some services expose metrics over insecure channel")
 		})
+
 		g.It("should have a AlertmanagerReceiversNotConfigured alert in firing state", func() {
 			ns := oc.SetupNamespace()
 			execPod := exutil.CreateExecPodOrFail(oc.AdminKubeClient(), ns, "execpod")
@@ -314,6 +315,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 
 			e2e.Logf("AlertmanagerReceiversNotConfigured alert is firing")
 		})
+
 		g.It("should have important platform topology metrics", func() {
 			oc.SetupProject()
 			ns := oc.Namespace()
@@ -340,6 +342,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			err := helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 			o.Expect(err).NotTo(o.HaveOccurred())
 		})
+
 		g.It("should have non-Pod host cAdvisor metrics", func() {
 			ns := oc.SetupNamespace()
 			execPod := exutil.CreateExecPodOrFail(oc.AdminKubeClient(), ns, "execpod")
@@ -353,6 +356,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			err := helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 			o.Expect(err).NotTo(o.HaveOccurred())
 		})
+
 		g.It("shouldn't have failing rules evaluation", func() {
 			oc.SetupProject()
 			ns := oc.Namespace()
@@ -370,6 +374,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			err := helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 			o.Expect(err).NotTo(o.HaveOccurred())
 		})
+
 		networking.InOpenShiftSDNContext(func() {
 			g.It("should be able to get the sdn ovs flows", func() {
 				ns := oc.SetupNamespace()
@@ -386,6 +391,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 				o.Expect(err).NotTo(o.HaveOccurred())
 			})
 		})
+
 		g.It("shouldn't report any alerts in firing state apart from Watchdog and AlertmanagerReceiversNotConfigured [Early]", func() {
 			if len(os.Getenv("TEST_UNSUPPORTED_ALLOW_VERSION_SKEW")) > 0 {
 				e2eskipper.Skipf("Test is disabled to allow cluster components to have different versions, and skewed versions trigger multiple other alerts")
@@ -403,6 +409,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			err := helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 			o.Expect(err).NotTo(o.HaveOccurred())
 		})
+
 		g.It("should provide ingress metrics", func() {
 			ns := oc.SetupNamespace()
 
@@ -413,7 +420,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			var lastErrs []error
 			o.Expect(wait.PollImmediate(10*time.Second, 4*time.Minute, func() (bool, error) {
-				contents, err := getBearerTokenURLViaPod(ns, execPod.Name, fmt.Sprintf("%s/api/v1/targets", url), bearerToken)
+				contents, err := getBearerTokenURLViaPod(ns, execPod.Name, fmt.Sprintf("%s/api/v1/targets", prometheusURL), bearerToken)
 				o.Expect(err).NotTo(o.HaveOccurred())
 
 				targets := &prometheusTargets{}
@@ -440,6 +447,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			err := helper.RunQueries(queries, oc, ns, execPod.Name, url, bearerToken)
 			o.Expect(err).NotTo(o.HaveOccurred())
 		})
+
 		g.It("should provide named network metrics", func() {
 			ns := oc.SetupNamespace()
 
diff --git a/test/extended/prometheus/prometheus_builds.go b/test/extended/prometheus/prometheus_builds.go
index 5be6d0927750..1b7dd0e89cc3 100644
--- a/test/extended/prometheus/prometheus_builds.go
+++ b/test/extended/prometheus/prometheus_builds.go
@@ -26,7 +26,7 @@ var _ = g.Describe("[sig-instrumentation][sig-builds][Feature:Builds] Prometheus
 	)
 	g.BeforeEach(func() {
 		var ok bool
-		url, bearerToken, ok = helper.LocatePrometheus(oc)
+		url, _, bearerToken, ok = helper.LocatePrometheus(oc)
 		if !ok {
 			e2eskipper.Skipf("Prometheus could not be located on this cluster, skipping prometheus test")
 		}
diff --git a/test/extended/prometheus/storage.go b/test/extended/prometheus/storage.go
index 492ba57bcc1e..84f5f46244ec 100644
--- a/test/extended/prometheus/storage.go
+++ b/test/extended/prometheus/storage.go
@@ -29,7 +29,7 @@ var _ = g.Describe("[sig-storage][Late] Metrics", func() {
 
 	g.BeforeEach(func() {
 		var ok bool
-		url, bearerToken, ok = helper.LocatePrometheus(oc)
+		url, _, bearerToken, ok = helper.LocatePrometheus(oc)
 		if !ok {
 			e2e.Failf("Prometheus could not be located on this cluster, failing prometheus test")
 		}
diff --git a/test/extended/util/prometheus/helpers.go b/test/extended/util/prometheus/helpers.go
index f47ed59aa8db..47cc7695d898 100644
--- a/test/extended/util/prometheus/helpers.go
+++ b/test/extended/util/prometheus/helpers.go
@@ -62,11 +62,11 @@ func waitForServiceAccountInNamespace(c clientset.Interface, ns, serviceAccountN
 	return err
 }
 
-// LocatePrometheus uses an exisitng CLI to return information used to make http requests to Prometheus
-func LocatePrometheus(oc *exutil.CLI) (url, bearerToken string, ok bool) {
+// LocatePrometheus uses an existing CLI to return information used to make http requests to Prometheus.
+func LocatePrometheus(oc *exutil.CLI) (queryURL, prometheusURL, bearerToken string, ok bool) {
 	_, err := oc.AdminKubeClient().CoreV1().Services("openshift-monitoring").Get(context.Background(), "prometheus-k8s", metav1.GetOptions{})
 	if kapierrs.IsNotFound(err) {
-		return "", "", false
+		return "", "", "", false
 	}
 
 	waitForServiceAccountInNamespace(oc.AdminKubeClient(), "openshift-monitoring", "prometheus-k8s", 2*time.Minute)
@@ -91,18 +91,7 @@ func LocatePrometheus(oc *exutil.CLI) (url, bearerToken string, ok bool) {
 	}
 	o.Expect(bearerToken).ToNot(o.BeEmpty())
 
-	return "https://prometheus-k8s.openshift-monitoring.svc:9091", bearerToken, true
-}
-
-// ExpectPrometheus uses an existing framework to return information used to make http requests to Prometheus
-func ExpectPrometheus(f *framework.Framework) (url, bearerToken string, oc *exutil.CLI, ok bool) {
-
-	// Must use version that's can run within Ginkgo It
-	oc = exutil.NewCLIWithFramework(f)
-
-	url, bearerToken, ok = LocatePrometheus(oc)
-
-	return url, bearerToken, oc, ok
+	return "https://thanos-querier.openshift-monitoring.svc:9091", "https://prometheus-k8s.openshift-monitoring.svc:9091", bearerToken, true
 }
 
 // RunQueries executes Prometheus queries and checks provided expected result.
@@ -133,7 +122,7 @@ func RunQueries(promQueries map[string]bool, oc *exutil.CLI, ns, execPodName, ba
 			}
 			metrics := result.Data.Result
 			if result.Status != "success" {
-				data, _ := json.Marshal(metrics)
+				data, _ := json.MarshalIndent(metrics, "", " ")
 				msg := fmt.Sprintf("promQL query: %s had reported incorrect status:\n%s", query, data)
 				if prev, ok := queryErrors[query]; !ok || prev.Error() != msg {
 					framework.Logf("%s", msg)
@@ -142,7 +131,7 @@ func RunQueries(promQueries map[string]bool, oc *exutil.CLI, ns, execPodName, ba
 				continue
 			}
 			if (len(metrics) > 0 && !expected) || (len(metrics) == 0 && expected) {
-				data, _ := json.Marshal(metrics)
+				data, _ := json.MarshalIndent(metrics, "", " ")
 				msg := fmt.Sprintf("promQL query: %s had reported incorrect results:\n%s", query, data)
 				if prev, ok := queryErrors[query]; !ok || prev.Error() != msg {
 					framework.Logf("%s", msg)
@@ -163,7 +152,7 @@ func RunQueries(promQueries map[string]bool, oc *exutil.CLI, ns, execPodName, ba
 	}
 
 	if len(queryErrors) != 0 {
-		exutil.DumpPodLogsStartingWith("prometheus-0", oc)
+		exutil.DumpPodLogsStartingWith("prometheus-k8s-", oc)
 	}
 	var errs []error
 	for query, err := range queryErrors {

From 59ef04b75480dd72636133faca63b00e78bfac0b Mon Sep 17 00:00:00 2001
From: Clayton Coleman
Date: Wed, 17 Feb 2021 18:00:00 -0500
Subject: [PATCH 2/2] test: Require no alerts during upgrades

It is unacceptable to fire alerts during normal upgrades. Tighten the
constraints on the upgrade tests to ensure that any alerts fail the
upgrade test. If the test is skipped, continue to report which alerts
fired. Check over the entire duration of the test, as we do in the
post-run variation.
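For context, the invariant pattern this commit introduces can be condensed into the
illustrative sketch below. It is not part of the diff: it assumes the helper.RunQuery
helper and the testDuration, ns, and execPod values defined in the new Test() body
further down, and the alert selector shown is only an example.

    // Sketch (assumptions noted above): count, over the window covered by the upgrade
    // test, how long each non-excluded alert was firing; any returned series is a violation.
    firingAlertQuery := fmt.Sprintf(
    	`count_over_time(ALERTS{alertstate="firing",severity!="info",alertname!~"Watchdog|AlertmanagerReceiversNotConfigured"}[%s:1s]) > 0`,
    	testDuration)
    result, err := helper.RunQuery(firingAlertQuery, ns, execPod.Name, queryURL, bearerToken)
    o.Expect(err).NotTo(o.HaveOccurred())
    for _, series := range result.Data.Result {
    	framework.Logf("alert %s fired for %s seconds", series.Metric["alertname"], series.Value)
    }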
---
 test/e2e/upgrade/alert/alert.go          | 211 ++++++++++++++++++-----
 test/extended/util/prometheus/helpers.go |  56 +++---
 2 files changed, 200 insertions(+), 67 deletions(-)

diff --git a/test/e2e/upgrade/alert/alert.go b/test/e2e/upgrade/alert/alert.go
index 4f205d6696f3..cb0a4e8d1273 100644
--- a/test/e2e/upgrade/alert/alert.go
+++ b/test/e2e/upgrade/alert/alert.go
@@ -3,44 +3,39 @@ package alert
 import (
 	"context"
 	"fmt"
+	"strings"
 	"time"
 
 	g "github.com/onsi/ginkgo"
 	o "github.com/onsi/gomega"
+	"github.com/prometheus/common/model"
 
 	exutil "github.com/openshift/origin/test/extended/util"
+	"github.com/openshift/origin/test/extended/util/disruption"
 	helper "github.com/openshift/origin/test/extended/util/prometheus"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/kubernetes/test/e2e/framework"
-	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
 	"k8s.io/kubernetes/test/e2e/upgrades"
 )
 
-const (
-	// Delay after upgrade is complete before checking for critical alerts
-	alertCheckSleepMinutes = 5
-	alertCheckSleep        = alertCheckSleepMinutes * time.Minute
-
-	// Previous period in which to check for critical alerts
-	alertPeriodCheckMinutes = 1
-)
-
-// UpgradeTest runs post-upgrade after alertCheckSleep delay and tests if any critical alerts are firing.
+// UpgradeTest verifies invariants regarding what alerts are allowed to fire
+// during the upgrade process.
 type UpgradeTest struct {
 	url         string
 	bearerToken string
 	oc          *exutil.CLI
 }
 
-func (UpgradeTest) Name() string { return "check-for-critical-alerts" }
+func (UpgradeTest) Name() string { return "check-for-alerts" }
 func (UpgradeTest) DisplayName() string {
-	return "[sig-arch] Check if critical alerts are firing after upgrade success"
+	return "[sig-arch] Check if alerts are firing during or after upgrade success"
 }
 
 // Setup creates parameters to query Prometheus
 func (t *UpgradeTest) Setup(f *framework.Framework) {
-	g.By("Setting up post-upgrade alert test")
+	g.By("Setting up upgrade alert test")
 
 	oc := exutil.NewCLIWithFramework(f)
 	url, _, bearerToken, ok := helper.LocatePrometheus(oc)
@@ -53,53 +48,183 @@ func (t *UpgradeTest) Setup(f *framework.Framework) {
 	framework.Logf("Post-upgrade alert test setup complete")
 }
 
-// Test checks if any critical alerts are firing.
+type MetricCondition struct {
+	Selector map[string]string
+	Text     string
+}
+
+type MetricConditions []MetricCondition
+
+func (c MetricConditions) Matches(sample *model.Sample) *MetricCondition {
+	for i, condition := range c {
+		matches := true
+		for name, value := range condition.Selector {
+			if sample.Metric[model.LabelName(name)] != model.LabelValue(value) {
+				matches = false
+				break
+			}
+		}
+		if matches {
+			return &c[i]
+		}
+	}
+	return nil
+}
+
+func stripLabels(m model.Metric, names ...string) model.LabelSet {
+	labels := make(model.LabelSet)
+	for k := range m {
+		labels[k] = m[k]
+	}
+	for _, name := range names {
+		delete(labels, model.LabelName(name))
+	}
+	return labels
+}
+
+// Test checks if alerts are firing at various points during upgrade.
+// An alert firing during an upgrade is a high severity bug - it either points to a real issue in
+// a dependency, or a failure of the component, and therefore must be fixed.
 func (t *UpgradeTest) Test(f *framework.Framework, done <-chan struct{}, upgrade upgrades.UpgradeType) {
-	g.By("Checking for critical alerts")
+	tolerateDuringSkew := exutil.TolerateVersionSkewInTests()
+	firingAlertsWithBugs := MetricConditions{
+		{
+			Selector: map[string]string{"alertname": "KubePodNotReady", "namespace": "openshift-kube-apiserver-operator"},
+			Text:     "https://bugzilla.redhat.com/show_bug.cgi?id=1939580",
+		},
+		{
+			Selector: map[string]string{"alertname": "ClusterOperatorDegraded", "name": "openshift-apiserver"},
+			Text:     "https://bugzilla.redhat.com/show_bug.cgi?id=1939580",
+		},
+		{
+			Selector: map[string]string{"alertname": "ClusterOperatorDown", "name": "authentication"},
+			Text:     "https://bugzilla.redhat.com/show_bug.cgi?id=1939580",
+		},
+		{
+			Selector: map[string]string{"alertname": "ClusterOperatorDegraded", "name": "authentication"},
+			Text:     "https://bugzilla.redhat.com/show_bug.cgi?id=1939580",
+		},
+
+		{
+			Selector: map[string]string{"alertname": "FailingOperator", "name": "packageserver", "job": "olm-operator-metrics"},
+			Text:     "https://bugzilla.redhat.com/show_bug.cgi?id=1932626",
+		},
+		{
+			Selector: map[string]string{"alertname": "ThanosSidecarUnhealthy", "job": "prometheus-k8s-thanos-sidecar"},
+			Text:     "https://bugzilla.redhat.com/show_bug.cgi?id=1940262",
+		},
+	}
+
+	pendingAlertsWithBugs := MetricConditions{
+		{
+			Selector: map[string]string{"alertname": "ClusterMonitoringOperatorReconciliationErrors"},
+			Text:     "https://bugzilla.redhat.com/show_bug.cgi?id=1932624",
+		},
+		{
+			Selector: map[string]string{"alertname": "KubeClientErrors"},
+			Text:     "https://bugzilla.redhat.com/show_bug.cgi?id=1925698",
+		},
+	}
+	allowedPendingAlerts := MetricConditions{
+		{
+			Selector: map[string]string{"alertname": "etcdMemberCommunicationSlow"},
+			Text:     "Excluded because it triggers during upgrade (detects ~5m of high latency immediately preceding the end of the test), and we don't want to change the alert because it is correct",
+		},
+	}
+
+	knownViolations := sets.NewString()
+	unexpectedViolations := sets.NewString()
+	unexpectedViolationsAsFlakes := sets.NewString()
+	debug := sets.NewString()
+
+	g.By("Checking for alerts")
 
-	// Recover current test if it fails so test suite can complete
-	defer g.GinkgoRecover()
+	start := time.Now()
 
 	// Block until upgrade is done
-	g.By("Waiting for upgrade to finish before checking for critical alerts")
+	g.By("Waiting for upgrade to finish before checking for alerts")
 	<-done
 
-	ctx, cancel := context.WithCancel(context.Background())
+	// Additional delay after upgrade completion to allow pending alerts to settle
+	g.By("Waiting before checking for alerts")
+	time.Sleep(1 * time.Minute)
 
-	// Additonal delay after upgrade completion
-	g.By("Waiting before checking for critical alerts")
-	time.Sleep(alertCheckSleep)
-	cancel()
-
-	if exutil.TolerateVersionSkewInTests() {
-		e2eskipper.Skipf("Test is disabled to allow cluster components to have different versions, and skewed versions trigger multiple other alerts")
-	}
-	t.oc.SetupProject()
-	ns := t.oc.Namespace()
+	ns := t.oc.SetupNamespace()
 	execPod := exutil.CreateExecPodOrFail(t.oc.AdminKubeClient(), ns, "execpod")
 	defer func() {
-		t.oc.AdminKubeClient().CoreV1().Pods(ns).Delete(ctx, execPod.Name, *metav1.NewDeleteOptions(1))
+		t.oc.AdminKubeClient().CoreV1().Pods(ns).Delete(context.Background(), execPod.Name, *metav1.NewDeleteOptions(1))
 	}()
 
-	// Query to check if Prometheus has been up and running for entire post-upgrade
-	// period by verifying Watchdog alert has been in firing state
-	watchdogQuery := fmt.Sprintf(`count_over_time(ALERTS{alertstate="firing",alertname="Watchdog", severity="none"}[%dm])`, alertCheckSleepMinutes)
-
-	// Query to check for any critical severity alerts that have occurred within the last alertPeriodCheckMinutes.
-	criticalAlertQuery := fmt.Sprintf(`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh",alertstate="firing",severity="critical"}[%dm]) >= 1`, alertPeriodCheckMinutes)
+	testDuration := time.Now().Sub(start).Truncate(time.Second)
+
+	// Invariant: The watchdog alert should be firing continuously during the whole upgrade via the thanos
+	// querier (which should have no gaps when it queries the individual stores). Allow zero or one changes
+	// to the presence of this series (zero if data is preserved over upgrade, one if data is lost on upgrade).
+	// This would not catch the alert stopping firing, but we catch that in other places and tests.
+	watchdogQuery := fmt.Sprintf(`changes((max((ALERTS{alertstate="firing",alertname="Watchdog",severity="none"}) or (absent(ALERTS{alertstate="firing",alertname="Watchdog",severity="none"})*0)))[%s:1s]) > 1`, testDuration)
+	result, err := helper.RunQuery(watchdogQuery, ns, execPod.Name, t.url, t.bearerToken)
+	o.Expect(err).NotTo(o.HaveOccurred(), "unable to check watchdog alert over upgrade window")
+	if len(result.Data.Result) > 0 {
+		unexpectedViolations.Insert("Watchdog alert had missing intervals during the run, which may be a sign of a Prometheus outage in violation of the prometheus query SLO of 100% uptime during upgrade")
+	}
 
-	tests := map[string]bool{
-		watchdogQuery:      true,
-		criticalAlertQuery: false,
+	// Invariant: No non-info level alerts should have fired during the upgrade
+	firingAlertQuery := fmt.Sprintf(`
+sort_desc(
+count_over_time(ALERTS{alertstate="firing",severity!="info",alertname!~"Watchdog|AlertmanagerReceiversNotConfigured"}[%[1]s:1s])
+) > 0
+`, testDuration)
+	result, err = helper.RunQuery(firingAlertQuery, ns, execPod.Name, t.url, t.bearerToken)
+	o.Expect(err).NotTo(o.HaveOccurred(), "unable to check firing alerts during upgrade")
+	for _, series := range result.Data.Result {
+		labels := stripLabels(series.Metric, "alertname", "alertstate", "prometheus")
+		violation := fmt.Sprintf("alert %s fired for %s seconds with labels: %s", series.Metric["alertname"], series.Value, labelsAsSelector(labels))
+		if cause := firingAlertsWithBugs.Matches(series); cause != nil {
+			knownViolations.Insert(fmt.Sprintf("%s (open bug: %s)", violation, cause.Text))
+		} else {
+			unexpectedViolations.Insert(violation)
+		}
 	}
-	err := helper.RunQueries(tests, t.oc, ns, execPod.Name, t.url, t.bearerToken)
-	o.Expect(err).NotTo(o.HaveOccurred())
 
-	framework.Logf("No critical alerts firing post-upgrade")
+	// Invariant: There should be no pending alerts 1m after the upgrade completes
+	pendingAlertQuery := `ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured",alertstate="pending",severity!="info"}`
+	result, err = helper.RunQuery(pendingAlertQuery, ns, execPod.Name, t.url, t.bearerToken)
+	o.Expect(err).NotTo(o.HaveOccurred(), "unable to retrieve pending alerts after upgrade")
+	for _, series := range result.Data.Result {
+		labels := stripLabels(series.Metric, "alertname", "alertstate", "prometheus")
+		violation := fmt.Sprintf("alert %s pending for %s seconds with labels: %s", series.Metric["alertname"], series.Value, labelsAsSelector(labels))
+		if cause := allowedPendingAlerts.Matches(series); cause != nil {
+			debug.Insert(fmt.Sprintf("%s (allowed: %s)", violation, cause.Text))
+			continue
+		}
+		if cause := pendingAlertsWithBugs.Matches(series); cause != nil {
+			knownViolations.Insert(fmt.Sprintf("%s (open bug: %s)", violation, cause.Text))
+		} else {
+			// treat pending errors as a flake right now because we are still trying to determine the scope
+			// TODO: move this to unexpectedViolations later
+			unexpectedViolationsAsFlakes.Insert(violation)
+		}
+	}
 
+	if len(debug) > 0 {
+		framework.Logf("Alerts were detected during upgrade which are allowed:\n\n%s", strings.Join(debug.List(), "\n"))
+	}
+	if len(unexpectedViolations) > 0 {
+		if !tolerateDuringSkew {
+			framework.Failf("Unexpected alerts fired or pending during the upgrade:\n\n%s", strings.Join(unexpectedViolations.List(), "\n"))
+		}
+	}
+	if flakes := sets.NewString().Union(knownViolations).Union(unexpectedViolations).Union(unexpectedViolationsAsFlakes); len(flakes) > 0 {
+		disruption.FrameworkFlakef(f, "Unexpected alert behavior during upgrade:\n\n%s", strings.Join(flakes.List(), "\n"))
+	}
+	framework.Logf("No alerts fired during upgrade")
 }
 
 // Teardown cleans up any remaining resources.
 func (t *UpgradeTest) Teardown(f *framework.Framework) {
 	// rely on the namespace deletion to clean up everything
 }
+
+func labelsAsSelector(l model.LabelSet) string {
+	return l.String()
+}
diff --git a/test/extended/util/prometheus/helpers.go b/test/extended/util/prometheus/helpers.go
index 47cc7695d898..2744c857b8cd 100644
--- a/test/extended/util/prometheus/helpers.go
+++ b/test/extended/util/prometheus/helpers.go
@@ -94,6 +94,26 @@ func LocatePrometheus(oc *exutil.CLI) (queryURL, prometheusURL, bearerToken stri
 	return "https://thanos-querier.openshift-monitoring.svc:9091", "https://prometheus-k8s.openshift-monitoring.svc:9091", bearerToken, true
 }
 
+func RunQuery(query, ns, execPodName, baseURL, bearerToken string) (*PrometheusResponse, error) {
+	url := fmt.Sprintf("%s/api/v1/query?%s", baseURL, (url.Values{"query": []string{query}}).Encode())
+	contents, err := GetBearerTokenURLViaPod(ns, execPodName, url, bearerToken)
+	if err != nil {
+		return nil, fmt.Errorf("unable to execute query %s: %v", query, err)
+	}
+
+	// parse the response and verify the query reported success
+	var result PrometheusResponse
+	if err := json.Unmarshal([]byte(contents), &result); err != nil {
+		return nil, fmt.Errorf("unable to parse query response for %s: %v", query, err)
+	}
+	metrics := result.Data.Result
+	if result.Status != "success" {
+		data, _ := json.MarshalIndent(metrics, "", " ")
+		return nil, fmt.Errorf("promQL query: %s had reported incorrect status:\n%s", query, data)
+	}
+	return &result, nil
+}
+
 // RunQueries executes Prometheus queries and checks provided expected result.
 func RunQueries(promQueries map[string]bool, oc *exutil.CLI, ns, execPodName, baseURL, bearerToken string) error {
 	// expect all correct metrics within a reasonable time period
@@ -108,33 +128,21 @@ func RunQueries(promQueries map[string]bool, oc *exutil.CLI, ns, execPodName, ba
 			// and introduced at https://github.com/prometheus/client_golang/blob/master/api/prometheus/v1/api.go are vendored into
 			// openshift/origin, look to replace this homegrown http request / query param with that API
 			g.By("perform prometheus metric query " + query)
-			url := fmt.Sprintf("%s/api/v1/query?%s", baseURL, (url.Values{"query": []string{query}}).Encode())
-			contents, err := GetBearerTokenURLViaPod(ns, execPodName, url, bearerToken)
+			result, err := RunQuery(query, ns, execPodName, baseURL, bearerToken)
 			if err != nil {
-				return err
-			}
-
-			// check query result, if this is a new error log it, otherwise remain silent
-			var result PrometheusResponse
-			if err := json.Unmarshal([]byte(contents), &result); err != nil {
-				framework.Logf("unable to parse query response for %s: %v", query, err)
-				continue
-			}
-			metrics := result.Data.Result
-			if result.Status != "success" {
-				data, _ := json.MarshalIndent(metrics, "", " ")
-				msg := fmt.Sprintf("promQL query: %s had reported incorrect status:\n%s", query, data)
-				if prev, ok := queryErrors[query]; !ok || prev.Error() != msg {
-					framework.Logf("%s", msg)
+				msg := err.Error()
+				if prev, ok := queryErrors[query]; ok && prev.Error() != msg {
+					framework.Logf("%s", prev.Error())
 				}
-				queryErrors[query] = fmt.Errorf(msg)
+				queryErrors[query] = err
 				continue
 			}
+			metrics := result.Data.Result
 			if (len(metrics) > 0 && !expected) || (len(metrics) == 0 && expected) {
-				data, _ := json.MarshalIndent(metrics, "", " ")
-				msg := fmt.Sprintf("promQL query: %s had reported incorrect results:\n%s", query, data)
-				if prev, ok := queryErrors[query]; !ok || prev.Error() != msg {
-					framework.Logf("%s", msg)
+				data, _ := json.MarshalIndent(result.Data.Result, "", " ")
+				msg := fmt.Sprintf("promQL query returned unexpected results:\n%s\n%s", query, data)
+				if prev, ok := queryErrors[query]; ok && prev.Error() != msg {
+					framework.Logf("%s", prev.Error())
 				}
 				queryErrors[query] = fmt.Errorf(msg)
 				continue
@@ -155,8 +163,8 @@ func RunQueries(promQueries map[string]bool, oc *exutil.CLI, ns, execPodName, ba
 		exutil.DumpPodLogsStartingWith("prometheus-k8s-", oc)
 	}
 	var errs []error
-	for query, err := range queryErrors {
-		errs = append(errs, fmt.Errorf("query failed: %s: %s", query, err))
+	for _, err := range queryErrors {
+		errs = append(errs, err)
 	}
 	return errors.NewAggregate(errs)
 }
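Appendix (illustrative only, not part of the patches above): a minimal sketch of how a
test could consume the reworked helpers. LocatePrometheus now returns both the
thanos-querier URL (for queries) and the prometheus-k8s URL (for direct endpoint
checks), and RunQuery returns the parsed response. The Describe/It names and the
example query are assumptions; the imports mirror those already used in the files
touched above.

    package example

    import (
    	g "github.com/onsi/ginkgo"
    	o "github.com/onsi/gomega"

    	exutil "github.com/openshift/origin/test/extended/util"
    	helper "github.com/openshift/origin/test/extended/util/prometheus"
    	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
    )

    var _ = g.Describe("[sig-instrumentation] Prometheus (example)", func() {
    	oc := exutil.NewCLIWithoutNamespace("prometheus-example")

    	g.It("answers aggregated queries through thanos-querier", func() {
    		// queryURL points at thanos-querier; the ignored second value is the
    		// direct prometheus-k8s URL used by tests that need a single instance.
    		queryURL, _, bearerToken, ok := helper.LocatePrometheus(oc)
    		if !ok {
    			e2eskipper.Skipf("Prometheus could not be located on this cluster")
    		}

    		ns := oc.SetupNamespace()
    		execPod := exutil.CreateExecPodOrFail(oc.AdminKubeClient(), ns, "execpod")

    		// The Watchdog alert should always be firing; query it via the aggregating endpoint.
    		result, err := helper.RunQuery(`ALERTS{alertstate="firing",alertname="Watchdog"}`, ns, execPod.Name, queryURL, bearerToken)
    		o.Expect(err).NotTo(o.HaveOccurred())
    		o.Expect(result.Data.Result).NotTo(o.BeEmpty(), "expected the Watchdog alert to be visible through thanos-querier")
    	})
    })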