diff --git a/test/e2e/upgrade/alert/alert.go b/test/e2e/upgrade/alert/alert.go
index 800feb9505ce..cb0a4e8d1273 100644
--- a/test/e2e/upgrade/alert/alert.go
+++ b/test/e2e/upgrade/alert/alert.go
@@ -3,46 +3,42 @@ package alert
 import (
 	"context"
 	"fmt"
+	"strings"
 	"time"
 
 	g "github.com/onsi/ginkgo"
 	o "github.com/onsi/gomega"
+	"github.com/prometheus/common/model"
 
 	exutil "github.com/openshift/origin/test/extended/util"
+	"github.com/openshift/origin/test/extended/util/disruption"
 	helper "github.com/openshift/origin/test/extended/util/prometheus"
 
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/kubernetes/test/e2e/framework"
-	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
 	"k8s.io/kubernetes/test/e2e/upgrades"
 )
 
-const (
-	// Delay after upgrade is complete before checking for critical alerts
-	alertCheckSleepMinutes = 5
-	alertCheckSleep        = alertCheckSleepMinutes * time.Minute
-
-	// Previous period in which to check for critical alerts
-	alertPeriodCheckMinutes = 1
-)
-
-// UpgradeTest runs post-upgrade after alertCheckSleep delay and tests if any critical alerts are firing.
+// UpgradeTest verifies invariants regarding what alerts are allowed to fire
+// during the upgrade process.
 type UpgradeTest struct {
 	url         string
 	bearerToken string
 	oc          *exutil.CLI
 }
 
-func (UpgradeTest) Name() string { return "check-for-critical-alerts" }
+func (UpgradeTest) Name() string { return "check-for-alerts" }
 func (UpgradeTest) DisplayName() string {
-	return "[sig-arch] Check if critical alerts are firing after upgrade success"
+	return "[sig-arch] Check if alerts are firing during or after upgrade success"
 }
 
 // Setup creates parameters to query Prometheus
 func (t *UpgradeTest) Setup(f *framework.Framework) {
-	g.By("Setting up post-upgrade alert test")
+	g.By("Setting up upgrade alert test")
 
-	url, bearerToken, oc, ok := helper.ExpectPrometheus(f)
+	oc := exutil.NewCLIWithFramework(f)
+	url, _, bearerToken, ok := helper.LocatePrometheus(oc)
 	if !ok {
 		framework.Failf("Prometheus could not be located on this cluster, failing test %s", t.Name())
 	}
@@ -52,53 +48,183 @@ func (t *UpgradeTest) Setup(f *framework.Framework) {
 	framework.Logf("Post-upgrade alert test setup complete")
 }
 
-// Test checks if any critical alerts are firing.
+type MetricCondition struct {
+	Selector map[string]string
+	Text     string
+}
+
+type MetricConditions []MetricCondition
+
+func (c MetricConditions) Matches(sample *model.Sample) *MetricCondition {
+	for i, condition := range c {
+		matches := true
+		for name, value := range condition.Selector {
+			if sample.Metric[model.LabelName(name)] != model.LabelValue(value) {
+				matches = false
+				break
+			}
+		}
+		if matches {
+			return &c[i]
+		}
+	}
+	return nil
+}
+
+func stripLabels(m model.Metric, names ...string) model.LabelSet {
+	labels := make(model.LabelSet)
+	for k := range m {
+		labels[k] = m[k]
+	}
+	for _, name := range names {
+		delete(labels, model.LabelName(name))
+	}
+	return labels
+}
+
+// Test checks if alerts are firing at various points during upgrade.
+// An alert firing during an upgrade is a high severity bug - it either points to a real issue in
+// a dependency, or a failure of the component, and therefore must be fixed.
 func (t *UpgradeTest) Test(f *framework.Framework, done <-chan struct{}, upgrade upgrades.UpgradeType) {
-	g.By("Checking for critical alerts")
+	tolerateDuringSkew := exutil.TolerateVersionSkewInTests()
+	firingAlertsWithBugs := MetricConditions{
+		{
+			Selector: map[string]string{"alertname": "KubePodNotReady", "namespace": "openshift-kube-apiserver-operator"},
+			Text:     "https://bugzilla.redhat.com/show_bug.cgi?id=1939580",
+		},
+		{
+			Selector: map[string]string{"alertname": "ClusterOperatorDegraded", "name": "openshift-apiserver"},
+			Text:     "https://bugzilla.redhat.com/show_bug.cgi?id=1939580",
+		},
+		{
+			Selector: map[string]string{"alertname": "ClusterOperatorDown", "name": "authentication"},
+			Text:     "https://bugzilla.redhat.com/show_bug.cgi?id=1939580",
+		},
+		{
+			Selector: map[string]string{"alertname": "ClusterOperatorDegraded", "name": "authentication"},
+			Text:     "https://bugzilla.redhat.com/show_bug.cgi?id=1939580",
+		},
+
+		{
+			Selector: map[string]string{"alertname": "FailingOperator", "name": "packageserver", "job": "olm-operator-metrics"},
+			Text:     "https://bugzilla.redhat.com/show_bug.cgi?id=1932626",
+		},
+		{
+			Selector: map[string]string{"alertname": "ThanosSidecarUnhealthy", "job": "prometheus-k8s-thanos-sidecar"},
+			Text:     "https://bugzilla.redhat.com/show_bug.cgi?id=1940262",
+		},
+	}
+
+	pendingAlertsWithBugs := MetricConditions{
+		{
+			Selector: map[string]string{"alertname": "ClusterMonitoringOperatorReconciliationErrors"},
+			Text:     "https://bugzilla.redhat.com/show_bug.cgi?id=1932624",
+		},
+		{
+			Selector: map[string]string{"alertname": "KubeClientErrors"},
+			Text:     "https://bugzilla.redhat.com/show_bug.cgi?id=1925698",
+		},
+	}
+	allowedPendingAlerts := MetricConditions{
+		{
+			Selector: map[string]string{"alertname": "etcdMemberCommunicationSlow"},
+			Text:     "Excluded because it triggers during upgrade (detects ~5m of high latency immediately preceding the end of the test), and we don't want to change the alert because it is correct",
+		},
+	}
+
+	knownViolations := sets.NewString()
+	unexpectedViolations := sets.NewString()
+	unexpectedViolationsAsFlakes := sets.NewString()
+	debug := sets.NewString()
+
+	g.By("Checking for alerts")
 
-	// Recover current test if it fails so test suite can complete
-	defer g.GinkgoRecover()
+	start := time.Now()
 
 	// Block until upgrade is done
-	g.By("Waiting for upgrade to finish before checking for critical alerts")
+	g.By("Waiting for upgrade to finish before checking for alerts")
 	<-done
 
-	ctx, cancel := context.WithCancel(context.Background())
+	// Additional delay after upgrade completion to allow pending alerts to settle
+	g.By("Waiting before checking for alerts")
+	time.Sleep(1 * time.Minute)
 
-	// Additonal delay after upgrade completion
-	g.By("Waiting before checking for critical alerts")
-	time.Sleep(alertCheckSleep)
-	cancel()
-
-	if exutil.TolerateVersionSkewInTests() {
-		e2eskipper.Skipf("Test is disabled to allow cluster components to have different versions, and skewed versions trigger multiple other alerts")
-	}
-	t.oc.SetupProject()
-	ns := t.oc.Namespace()
+	ns := t.oc.SetupNamespace()
 	execPod := exutil.CreateExecPodOrFail(t.oc.AdminKubeClient(), ns, "execpod")
 	defer func() {
-		t.oc.AdminKubeClient().CoreV1().Pods(ns).Delete(ctx, execPod.Name, *metav1.NewDeleteOptions(1))
+		t.oc.AdminKubeClient().CoreV1().Pods(ns).Delete(context.Background(), execPod.Name, *metav1.NewDeleteOptions(1))
 	}()
 
-	// Query to check if Prometheus has been up and running for entire post-upgrade
-	// period by verifying Watchdog alert has been in firing state
-	watchdogQuery := fmt.Sprintf(`count_over_time(ALERTS{alertstate="firing",alertname="Watchdog", severity="none"}[%dm])`, alertCheckSleepMinutes)
-
-	// Query to check for any critical severity alerts that have occurred within the last alertPeriodCheckMinutes.
-	criticalAlertQuery := fmt.Sprintf(`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh",alertstate="firing",severity="critical"}[%dm]) >= 1`, alertPeriodCheckMinutes)
+	testDuration := time.Now().Sub(start).Truncate(time.Second)
+
+	// Invariant: The watchdog alert should be firing continuously during the whole upgrade via the thanos
+	// querier (which should have no gaps when it queries the individual stores). Allow zero or one changes
+	// to the presence of this series (zero if data is preserved over upgrade, one if data is lost on upgrade).
+	// This would not catch the alert stopping firing, but we catch that in other places and tests.
+	watchdogQuery := fmt.Sprintf(`changes((max((ALERTS{alertstate="firing",alertname="Watchdog",severity="none"}) or (absent(ALERTS{alertstate="firing",alertname="Watchdog",severity="none"})*0)))[%s:1s]) > 1`, testDuration)
+	result, err := helper.RunQuery(watchdogQuery, ns, execPod.Name, t.url, t.bearerToken)
+	o.Expect(err).NotTo(o.HaveOccurred(), "unable to check watchdog alert over upgrade window")
+	if len(result.Data.Result) > 0 {
+		unexpectedViolations.Insert("Watchdog alert had missing intervals during the run, which may be a sign of a Prometheus outage in violation of the prometheus query SLO of 100% uptime during upgrade")
+	}
 
-	tests := map[string]bool{
-		watchdogQuery:      true,
-		criticalAlertQuery: false,
+	// Invariant: No non-info level alerts should have fired during the upgrade
+	firingAlertQuery := fmt.Sprintf(`
+sort_desc(
+count_over_time(ALERTS{alertstate="firing",severity!="info",alertname!~"Watchdog|AlertmanagerReceiversNotConfigured"}[%[1]s:1s])
+) > 0
+`, testDuration)
+	result, err = helper.RunQuery(firingAlertQuery, ns, execPod.Name, t.url, t.bearerToken)
+	o.Expect(err).NotTo(o.HaveOccurred(), "unable to check firing alerts during upgrade")
+	for _, series := range result.Data.Result {
+		labels := stripLabels(series.Metric, "alertname", "alertstate", "prometheus")
+		violation := fmt.Sprintf("alert %s fired for %s seconds with labels: %s", series.Metric["alertname"], series.Value, labelsAsSelector(labels))
+		if cause := firingAlertsWithBugs.Matches(series); cause != nil {
+			knownViolations.Insert(fmt.Sprintf("%s (open bug: %s)", violation, cause.Text))
+		} else {
+			unexpectedViolations.Insert(violation)
+		}
 	}
 
-	err := helper.RunQueries(tests, t.oc, ns, execPod.Name, t.url, t.bearerToken)
-	o.Expect(err).NotTo(o.HaveOccurred())
+	// Invariant: There should be no pending alerts 1m after the upgrade completes
+	pendingAlertQuery := `ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured",alertstate="pending",severity!="info"}`
+	result, err = helper.RunQuery(pendingAlertQuery, ns, execPod.Name, t.url, t.bearerToken)
+	o.Expect(err).NotTo(o.HaveOccurred(), "unable to retrieve pending alerts after upgrade")
+	for _, series := range result.Data.Result {
+		labels := stripLabels(series.Metric, "alertname", "alertstate", "prometheus")
+		violation := fmt.Sprintf("alert %s pending for %s seconds with labels: %s", series.Metric["alertname"], series.Value, labelsAsSelector(labels))
+		if cause := allowedPendingAlerts.Matches(series); cause != nil {
+			debug.Insert(fmt.Sprintf("%s (allowed: %s)", violation, cause.Text))
+			continue
+		}
+		if cause := pendingAlertsWithBugs.Matches(series); cause != nil {
+			knownViolations.Insert(fmt.Sprintf("%s (open bug: %s)", violation, cause.Text))
+		} else {
+			// treat pending errors as a flake right now because we are still trying to determine the scope
+			// TODO: move this to unexpectedViolations later
+			unexpectedViolationsAsFlakes.Insert(violation)
+		}
+	}
 
-	framework.Logf("No critical alerts firing post-upgrade")
+	if len(debug) > 0 {
+		framework.Logf("Alerts were detected during upgrade which are allowed:\n\n%s", strings.Join(debug.List(), "\n"))
+	}
+	if len(unexpectedViolations) > 0 {
+		if !tolerateDuringSkew {
+			framework.Failf("Unexpected alerts fired or pending during the upgrade:\n\n%s", strings.Join(unexpectedViolations.List(), "\n"))
+		}
+	}
+	if flakes := sets.NewString().Union(knownViolations).Union(unexpectedViolations).Union(unexpectedViolationsAsFlakes); len(flakes) > 0 {
+		disruption.FrameworkFlakef(f, "Unexpected alert behavior during upgrade:\n\n%s", strings.Join(flakes.List(), "\n"))
+	}
+	framework.Logf("No alerts fired during upgrade")
 }
 
 // Teardown cleans up any remaining resources.
 func (t *UpgradeTest) Teardown(f *framework.Framework) {
 	// rely on the namespace deletion to clean up everything
 }
+
+func labelsAsSelector(l model.LabelSet) string {
+	return l.String()
+}
diff --git a/test/extended/machines/cluster.go b/test/extended/machines/cluster.go
index 4dd78b6577c8..5e6cd1866e41 100644
--- a/test/extended/machines/cluster.go
+++ b/test/extended/machines/cluster.go
@@ -66,7 +66,7 @@ var _ = g.Describe("[sig-node] Managed cluster", func() {
 	)
 	g.BeforeEach(func() {
 		var ok bool
-		url, bearerToken, ok = prometheus.LocatePrometheus(oc)
+		url, _, bearerToken, ok = prometheus.LocatePrometheus(oc)
 		if !ok {
 			e2e.Failf("Prometheus could not be located on this cluster, failing prometheus test")
 		}
diff --git a/test/extended/prometheus/prometheus.go b/test/extended/prometheus/prometheus.go
index 00836c8b0170..9a6acbb0b917 100644
--- a/test/extended/prometheus/prometheus.go
+++ b/test/extended/prometheus/prometheus.go
@@ -45,7 +45,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
 	)
 	g.BeforeEach(func() {
 		var ok bool
-		url, bearerToken, ok = helper.LocatePrometheus(oc)
+		url, _, bearerToken, ok = helper.LocatePrometheus(oc)
 		if !ok {
 			e2e.Failf("Prometheus could not be located on this cluster, failing prometheus test")
 		}
@@ -133,12 +133,12 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 	var (
 		oc = exutil.NewCLIWithoutNamespace("prometheus")
 
-		url, bearerToken string
+		url, prometheusURL, bearerToken string
 	)
 
 	g.BeforeEach(func() {
 		var ok bool
-		url, bearerToken, ok = helper.LocatePrometheus(oc)
+		url, prometheusURL, bearerToken, ok = helper.LocatePrometheus(oc)
 		if !ok {
 			e2e.Failf("Prometheus could not be located on this cluster, failing prometheus test")
 		}
@@ -183,7 +183,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			g.By("checking the prometheus metrics path")
 			var metrics map[string]*dto.MetricFamily
 			o.Expect(wait.PollImmediate(10*time.Second, 2*time.Minute, func() (bool, error) {
-				results, err := getInsecureURLViaPod(ns, execPod.Name, fmt.Sprintf("%s/metrics", url))
+				results, err := getInsecureURLViaPod(ns, execPod.Name, fmt.Sprintf("%s/metrics", prometheusURL))
 				if err != nil {
 					e2e.Logf("unable to get metrics: %v", err)
 					return false, nil
@@ -222,7 +222,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			// expect all endpoints within 60 seconds
 			var lastErrs []error
 			o.Expect(wait.PollImmediate(10*time.Second, 2*time.Minute, func() (bool, error) {
-				contents, err := getBearerTokenURLViaPod(ns, execPod.Name, fmt.Sprintf("%s/api/v1/targets", url), bearerToken)
+				contents, err := getBearerTokenURLViaPod(ns, execPod.Name, fmt.Sprintf("%s/api/v1/targets", prometheusURL), bearerToken)
 				o.Expect(err).NotTo(o.HaveOccurred())
 
 				targets := &prometheusTargets{}
@@ -269,7 +269,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			g.By("verifying all targets are exposing metrics over secure channel")
 			var insecureTargets []error
-			contents, err := getBearerTokenURLViaPod(ns, execPod.Name, fmt.Sprintf("%s/api/v1/targets", url), bearerToken)
+			contents, err := getBearerTokenURLViaPod(ns, execPod.Name, fmt.Sprintf("%s/api/v1/targets", prometheusURL), bearerToken)
 			o.Expect(err).NotTo(o.HaveOccurred())
 
 			targets := &prometheusTargets{}
@@ -299,6 +299,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			}
 			o.Expect(insecureTargets).To(o.BeEmpty(), "some services expose metrics over insecure channel")
 		})
+
 		g.It("should have a AlertmanagerReceiversNotConfigured alert in firing state", func() {
 			ns := oc.SetupNamespace()
 			execPod := exutil.CreateExecPodOrFail(oc.AdminKubeClient(), ns, "execpod")
@@ -314,6 +315,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 
 			e2e.Logf("AlertmanagerReceiversNotConfigured alert is firing")
 		})
+
 		g.It("should have important platform topology metrics", func() {
 			oc.SetupProject()
 			ns := oc.Namespace()
@@ -340,6 +342,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			err := helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 			o.Expect(err).NotTo(o.HaveOccurred())
 		})
+
 		g.It("should have non-Pod host cAdvisor metrics", func() {
 			ns := oc.SetupNamespace()
 			execPod := exutil.CreateExecPodOrFail(oc.AdminKubeClient(), ns, "execpod")
@@ -353,6 +356,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			err := helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 			o.Expect(err).NotTo(o.HaveOccurred())
 		})
+
 		g.It("shouldn't have failing rules evaluation", func() {
 			oc.SetupProject()
 			ns := oc.Namespace()
@@ -370,6 +374,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			err := helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 			o.Expect(err).NotTo(o.HaveOccurred())
 		})
+
 		networking.InOpenShiftSDNContext(func() {
 			g.It("should be able to get the sdn ovs flows", func() {
 				ns := oc.SetupNamespace()
@@ -386,6 +391,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 				o.Expect(err).NotTo(o.HaveOccurred())
 			})
 		})
+
 		g.It("shouldn't report any alerts in firing state apart from Watchdog and AlertmanagerReceiversNotConfigured [Early]", func() {
 			if len(os.Getenv("TEST_UNSUPPORTED_ALLOW_VERSION_SKEW")) > 0 {
 				e2eskipper.Skipf("Test is disabled to allow cluster components to have different versions, and skewed versions trigger multiple other alerts")
@@ -403,6 +409,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			err := helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 			o.Expect(err).NotTo(o.HaveOccurred())
 		})
+
 		g.It("should provide ingress metrics", func() {
 			ns := oc.SetupNamespace()
 
@@ -413,7 +420,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			var lastErrs []error
 			o.Expect(wait.PollImmediate(10*time.Second, 4*time.Minute, func() (bool, error) {
-				contents, err := getBearerTokenURLViaPod(ns, execPod.Name, fmt.Sprintf("%s/api/v1/targets", url), bearerToken)
+				contents, err := getBearerTokenURLViaPod(ns, execPod.Name, fmt.Sprintf("%s/api/v1/targets", prometheusURL), bearerToken)
 				o.Expect(err).NotTo(o.HaveOccurred())
 
 				targets := &prometheusTargets{}
@@ -440,6 +447,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			err := helper.RunQueries(queries, oc, ns, execPod.Name, url, bearerToken)
 			o.Expect(err).NotTo(o.HaveOccurred())
 		})
+
 		g.It("should provide named network metrics", func() {
 			ns := oc.SetupNamespace()
diff --git a/test/extended/prometheus/prometheus_builds.go b/test/extended/prometheus/prometheus_builds.go
index 5be6d0927750..1b7dd0e89cc3 100644
--- a/test/extended/prometheus/prometheus_builds.go
+++ b/test/extended/prometheus/prometheus_builds.go
@@ -26,7 +26,7 @@ var _ = g.Describe("[sig-instrumentation][sig-builds][Feature:Builds] Prometheus
 	)
 	g.BeforeEach(func() {
 		var ok bool
-		url, bearerToken, ok = helper.LocatePrometheus(oc)
+		url, _, bearerToken, ok = helper.LocatePrometheus(oc)
 		if !ok {
 			e2eskipper.Skipf("Prometheus could not be located on this cluster, skipping prometheus test")
 		}
diff --git a/test/extended/prometheus/storage.go b/test/extended/prometheus/storage.go
index 492ba57bcc1e..84f5f46244ec 100644
--- a/test/extended/prometheus/storage.go
+++ b/test/extended/prometheus/storage.go
@@ -29,7 +29,7 @@ var _ = g.Describe("[sig-storage][Late] Metrics", func() {
 
 	g.BeforeEach(func() {
 		var ok bool
-		url, bearerToken, ok = helper.LocatePrometheus(oc)
+		url, _, bearerToken, ok = helper.LocatePrometheus(oc)
 		if !ok {
 			e2e.Failf("Prometheus could not be located on this cluster, failing prometheus test")
 		}
diff --git a/test/extended/util/prometheus/helpers.go b/test/extended/util/prometheus/helpers.go
index f47ed59aa8db..2744c857b8cd 100644
--- a/test/extended/util/prometheus/helpers.go
+++ b/test/extended/util/prometheus/helpers.go
@@ -62,11 +62,11 @@ func waitForServiceAccountInNamespace(c clientset.Interface, ns, serviceAccountN
 	return err
 }
 
-// LocatePrometheus uses an exisitng CLI to return information used to make http requests to Prometheus
-func LocatePrometheus(oc *exutil.CLI) (url, bearerToken string, ok bool) {
+// LocatePrometheus uses an existing CLI to return information used to make http requests to Prometheus.
+func LocatePrometheus(oc *exutil.CLI) (queryURL, prometheusURL, bearerToken string, ok bool) {
 	_, err := oc.AdminKubeClient().CoreV1().Services("openshift-monitoring").Get(context.Background(), "prometheus-k8s", metav1.GetOptions{})
 	if kapierrs.IsNotFound(err) {
-		return "", "", false
+		return "", "", "", false
 	}
 
 	waitForServiceAccountInNamespace(oc.AdminKubeClient(), "openshift-monitoring", "prometheus-k8s", 2*time.Minute)
@@ -91,18 +91,27 @@ func LocatePrometheus(oc *exutil.CLI) (url, bearerToken string, ok bool) {
 	}
 	o.Expect(bearerToken).ToNot(o.BeEmpty())
 
-	return "https://prometheus-k8s.openshift-monitoring.svc:9091", bearerToken, true
+	return "https://thanos-querier.openshift-monitoring.svc:9091", "https://prometheus-k8s.openshift-monitoring.svc:9091", bearerToken, true
 }
 
-// ExpectPrometheus uses an existing framework to return information used to make http requests to Prometheus
-func ExpectPrometheus(f *framework.Framework) (url, bearerToken string, oc *exutil.CLI, ok bool) {
-
-	// Must use version that's can run within Ginkgo It
-	oc = exutil.NewCLIWithFramework(f)
-
-	url, bearerToken, ok = LocatePrometheus(oc)
+func RunQuery(query, ns, execPodName, baseURL, bearerToken string) (*PrometheusResponse, error) {
+	url := fmt.Sprintf("%s/api/v1/query?%s", baseURL, (url.Values{"query": []string{query}}).Encode())
+	contents, err := GetBearerTokenURLViaPod(ns, execPodName, url, bearerToken)
+	if err != nil {
+		return nil, fmt.Errorf("unable to execute query %s: %v", query, err)
+	}
 
-	return url, bearerToken, ok
+	// unmarshal the response and report a failed query status as an error
+	var result PrometheusResponse
+	if err := json.Unmarshal([]byte(contents), &result); err != nil {
+		return nil, fmt.Errorf("unable to parse query response for %s: %v", query, err)
+	}
+	metrics := result.Data.Result
+	if result.Status != "success" {
+		data, _ := json.MarshalIndent(metrics, "", " ")
+		return nil, fmt.Errorf("promQL query: %s had reported incorrect status:\n%s", query, data)
+	}
+	return &result, nil
 }
 
 // RunQueries executes Prometheus queries and checks provided expected result.
@@ -119,33 +128,21 @@ func RunQueries(promQueries map[string]bool, oc *exutil.CLI, ns, execPodName, ba
 		// and introduced at https://github.com/prometheus/client_golang/blob/master/api/prometheus/v1/api.go are vendored into
 		// openshift/origin, look to replace this homegrown http request / query param with that API
 		g.By("perform prometheus metric query " + query)
-		url := fmt.Sprintf("%s/api/v1/query?%s", baseURL, (url.Values{"query": []string{query}}).Encode())
-		contents, err := GetBearerTokenURLViaPod(ns, execPodName, url, bearerToken)
+		result, err := RunQuery(query, ns, execPodName, baseURL, bearerToken)
 		if err != nil {
-			return err
-		}
-
-		// check query result, if this is a new error log it, otherwise remain silent
-		var result PrometheusResponse
-		if err := json.Unmarshal([]byte(contents), &result); err != nil {
-			framework.Logf("unable to parse query response for %s: %v", query, err)
-			continue
-		}
-		metrics := result.Data.Result
-		if result.Status != "success" {
-			data, _ := json.Marshal(metrics)
-			msg := fmt.Sprintf("promQL query: %s had reported incorrect status:\n%s", query, data)
-			if prev, ok := queryErrors[query]; !ok || prev.Error() != msg {
-				framework.Logf("%s", msg)
+			msg := err.Error()
+			if prev, ok := queryErrors[query]; ok && prev.Error() != msg {
+				framework.Logf("%s", prev.Error())
 			}
-			queryErrors[query] = fmt.Errorf(msg)
+			queryErrors[query] = err
 			continue
 		}
+		metrics := result.Data.Result
 		if (len(metrics) > 0 && !expected) || (len(metrics) == 0 && expected) {
-			data, _ := json.Marshal(metrics)
-			msg := fmt.Sprintf("promQL query: %s had reported incorrect results:\n%s", query, data)
-			if prev, ok := queryErrors[query]; !ok || prev.Error() != msg {
-				framework.Logf("%s", msg)
+			data, _ := json.MarshalIndent(result.Data.Result, "", " ")
+			msg := fmt.Sprintf("promQL query returned unexpected results:\n%s\n%s", query, data)
+			if prev, ok := queryErrors[query]; ok && prev.Error() != msg {
+				framework.Logf("%s", prev.Error())
 			}
 			queryErrors[query] = fmt.Errorf(msg)
 			continue
@@ -163,11 +160,11 @@ func RunQueries(promQueries map[string]bool, oc *exutil.CLI, ns, execPodName, ba
 	}
 
 	if len(queryErrors) != 0 {
-		exutil.DumpPodLogsStartingWith("prometheus-0", oc)
+		exutil.DumpPodLogsStartingWith("prometheus-k8s-", oc)
 	}
 
 	var errs []error
-	for query, err := range queryErrors {
-		errs = append(errs, fmt.Errorf("query failed: %s: %s", query, err))
+	for _, err := range queryErrors {
+		errs = append(errs, err)
 	}
 	return errors.NewAggregate(errs)
 }
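
Note for reviewers: a minimal, self-contained sketch of how the new MetricConditions matching in alert.go above is intended to behave, using the vendored github.com/prometheus/common/model types. The types mirror the ones added in the patch; the sample labels (the pod name in particular) are hypothetical and only for illustration.

package main

import (
	"fmt"

	"github.com/prometheus/common/model"
)

// MetricCondition pairs a label selector with an explanatory note (usually a bug link).
type MetricCondition struct {
	Selector map[string]string
	Text     string
}

type MetricConditions []MetricCondition

// Matches returns the first condition whose selector labels all match the sample, or nil.
func (c MetricConditions) Matches(sample *model.Sample) *MetricCondition {
	for i, condition := range c {
		matches := true
		for name, value := range condition.Selector {
			if sample.Metric[model.LabelName(name)] != model.LabelValue(value) {
				matches = false
				break
			}
		}
		if matches {
			return &c[i]
		}
	}
	return nil
}

func main() {
	known := MetricConditions{{
		Selector: map[string]string{"alertname": "KubePodNotReady", "namespace": "openshift-kube-apiserver-operator"},
		Text:     "https://bugzilla.redhat.com/show_bug.cgi?id=1939580",
	}}
	// A firing ALERTS sample as returned by the query; extra labels do not prevent a match.
	sample := &model.Sample{Metric: model.Metric{
		"alertname": "KubePodNotReady",
		"namespace": "openshift-kube-apiserver-operator",
		"pod":       "kube-apiserver-operator-7d9cbbd8f-x2k4q", // hypothetical pod name
	}}
	if cause := known.Matches(sample); cause != nil {
		fmt.Println("known violation, open bug:", cause.Text)
	}
}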
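The watchdog and firing-alert invariants rely on PromQL subqueries evaluated over the whole upgrade window at 1s resolution. The sketch below shows what those query strings expand to for a hypothetical 47-minute upgrade; in the test, testDuration is derived from the measured start time rather than hard-coded.

package main

import (
	"fmt"
	"time"
)

func main() {
	// Hypothetical upgrade duration; the test computes time.Now().Sub(start).Truncate(time.Second).
	testDuration := (47 * time.Minute).Truncate(time.Second)

	// Subquery over the full window: more than one change in the presence of the Watchdog series
	// means the series had gaps, i.e. Prometheus data loss beyond the single expected loss on upgrade.
	watchdogQuery := fmt.Sprintf(`changes((max((ALERTS{alertstate="firing",alertname="Watchdog",severity="none"}) or (absent(ALERTS{alertstate="firing",alertname="Watchdog",severity="none"})*0)))[%s:1s]) > 1`, testDuration)

	// Any non-info alert (other than the always-firing Watchdog and the known
	// AlertmanagerReceiversNotConfigured) that fired at any point in the window is a violation.
	firingAlertQuery := fmt.Sprintf(`
sort_desc(
count_over_time(ALERTS{alertstate="firing",severity!="info",alertname!~"Watchdog|AlertmanagerReceiversNotConfigured"}[%[1]s:1s])
) > 0
`, testDuration)

	fmt.Println(watchdogQuery) // the range renders as [47m0s:1s]
	fmt.Println(firingAlertQuery)
}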