diff --git a/test/e2e/upgrade/alert/alert.go b/test/e2e/upgrade/alert/alert.go
new file mode 100644
index 000000000000..558e9a38e0dd
--- /dev/null
+++ b/test/e2e/upgrade/alert/alert.go
@@ -0,0 +1,103 @@
+package alert
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	g "github.com/onsi/ginkgo"
+
+	exutil "github.com/openshift/origin/test/extended/util"
+	helper "github.com/openshift/origin/test/extended/util/prometheus"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/kubernetes/test/e2e/framework"
+	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
+	"k8s.io/kubernetes/test/e2e/upgrades"
+)
+
+const (
+	// Delay after upgrade is complete before checking for critical alerts
+	alertCheckSleepMinutes = 5
+	alertCheckSleep        = alertCheckSleepMinutes * time.Minute
+
+	// Previous period in which to check for critical alerts
+	alertPeriodCheckMinutes = 1
+)
+
+// UpgradeTest runs post-upgrade after the alertCheckSleep delay and tests whether any critical alerts are firing.
+type UpgradeTest struct {
+	url         string
+	bearerToken string
+	oc          *exutil.CLI
+}
+
+func (UpgradeTest) Name() string { return "check-for-critical-alerts" }
+func (UpgradeTest) DisplayName() string {
+	return "Check if critical alerts are firing after upgrade success"
+}
+
+// Setup creates parameters to query Prometheus
+func (t *UpgradeTest) Setup(f *framework.Framework) {
+	g.By("Setting up post-upgrade alert test")
+
+	url, bearerToken, oc, ok := helper.ExpectPrometheus(f)
+	if !ok {
+		framework.Failf("Prometheus could not be located on this cluster, failing test %s", t.Name())
+	}
+	t.url = url
+	t.bearerToken = bearerToken
+	t.oc = oc
+	framework.Logf("Post-upgrade alert test setup complete")
+}
+
+// Test checks if any critical alerts are firing.
+func (t *UpgradeTest) Test(f *framework.Framework, done <-chan struct{}, upgrade upgrades.UpgradeType) {
+	g.By("Checking for critical alerts")
+
+	// Recover the current test if it fails so the test suite can complete
+	defer g.GinkgoRecover()
+
+	// Block until upgrade is done
+	g.By("Waiting for upgrade to finish before checking for critical alerts")
+	<-done
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	// Additional delay after upgrade completion
+	g.By("Waiting before checking for critical alerts")
+	time.Sleep(alertCheckSleep)
+
+	if helper.TestUnsupportedAllowVersionSkew() {
+		e2eskipper.Skipf("Test is disabled to allow cluster components to have different versions, and skewed versions trigger multiple other alerts")
+	}
+	t.oc.SetupProject()
+	ns := t.oc.Namespace()
+	execPod := exutil.CreateCentosExecPodOrFail(t.oc.AdminKubeClient(), ns, "execpod", nil)
+	defer func() {
+		t.oc.AdminKubeClient().CoreV1().Pods(ns).Delete(ctx, execPod.Name, *metav1.NewDeleteOptions(1))
+	}()
+
+	// Query to check if Prometheus has been up and running for the entire post-upgrade
+	// period by verifying the Watchdog alert has been in the firing state
+	watchdogQuery := fmt.Sprintf(`count_over_time(ALERTS{alertstate="firing",alertname="Watchdog", severity="none"}[%dm])`, alertCheckSleepMinutes)
+
+	// Query to check for any critical severity alerts that have occurred within the last alertPeriodCheckMinutes.
+	// TODO Remove KubeAPIErrorBudgetBurn from ignore list once Bug 1821661 is fixed.
+	criticalAlertQuery := fmt.Sprintf(`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh|KubeAPIErrorBudgetBurn",alertstate="firing",severity="critical"}[%dm]) >= 1`, alertPeriodCheckMinutes)
+
+	tests := map[string]bool{
+		watchdogQuery:      true,
+		criticalAlertQuery: false,
+	}
+
+	helper.RunQueries(tests, t.oc, ns, execPod.Name, t.url, t.bearerToken)
+
+	framework.Logf("No critical alerts firing post-upgrade")
+}
+
+// Teardown cleans up any remaining resources.
+func (t *UpgradeTest) Teardown(f *framework.Framework) {
+	// rely on the namespace deletion to clean up everything
+}
diff --git a/test/e2e/upgrade/upgrade.go b/test/e2e/upgrade/upgrade.go
index 45bc72ac24f4..8cd65c3fb1cd 100644
--- a/test/e2e/upgrade/upgrade.go
+++ b/test/e2e/upgrade/upgrade.go
@@ -26,6 +26,7 @@ import (
 	configv1 "github.com/openshift/api/config/v1"
 	configv1client "github.com/openshift/client-go/config/clientset/versioned"
+	"github.com/openshift/origin/test/e2e/upgrade/alert"
 	"github.com/openshift/origin/test/e2e/upgrade/service"
 	"github.com/openshift/origin/test/extended/util/disruption"
 	"github.com/openshift/origin/test/extended/util/disruption/controlplane"
@@ -36,6 +37,7 @@ func AllTests() []upgrades.Test {
 	return []upgrades.Test{
 		&controlplane.KubeAvailableTest{},
 		&controlplane.OpenShiftAvailableTest{},
+		&alert.UpgradeTest{},
 		&frontends.AvailableTest{},
 		&service.UpgradeTest{},
 		&upgrades.SecretUpgradeTest{},
diff --git a/test/extended/prometheus/prometheus.go b/test/extended/prometheus/prometheus.go
index d9b8aa6b550c..68b42e5370b0 100644
--- a/test/extended/prometheus/prometheus.go
+++ b/test/extended/prometheus/prometheus.go
@@ -33,6 +33,7 @@ import (
 	"github.com/openshift/origin/test/extended/networking"
 	exutil "github.com/openshift/origin/test/extended/util"
 	"github.com/openshift/origin/test/extended/util/ibmcloud"
+	helper "github.com/openshift/origin/test/extended/util/prometheus"
 )
 
 var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
@@ -44,7 +45,7 @@
 	)
 	g.BeforeEach(func() {
 		var ok bool
-		url, bearerToken, ok = locatePrometheus(oc)
+		url, bearerToken, ok = helper.LocatePrometheus(oc)
 		if !ok {
 			e2e.Failf("Prometheus could not be located on this cluster, failing prometheus test")
 		}
@@ -65,7 +66,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
 			// Checking Watchdog alert state is done in "should have a Watchdog alert in firing state".
 			`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh",alertstate="firing",severity!="info"}[2h]) >= 1`: false,
 		}
-		runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
+		helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 	})
 
 	g.It("should have a Watchdog alert in firing state the entire cluster run", func() {
@@ -80,7 +81,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
 			// should have constantly firing a watchdog alert
 			`count_over_time(ALERTS{alertstate="firing",alertname="Watchdog", severity="none"}[1h])`: true,
 		}
-		runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
+		helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 
 		e2e.Logf("Watchdog alert is firing")
 	})
@@ -102,7 +103,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
 			// rule contains the count of the all the series that are sent via telemetry.
 			`max_over_time(cluster:telemetry_selected_series:count[2h]) >= 500`: false,
 		}
-		runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
+		helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 
 		e2e.Logf("Total number of series sent via telemetry is below the limit")
 	})
@@ -119,7 +120,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 	g.BeforeEach(func() {
 		var ok bool
-		url, bearerToken, ok = locatePrometheus(oc)
+		url, bearerToken, ok = helper.LocatePrometheus(oc)
 		if !ok {
 			e2e.Failf("Prometheus could not be located on this cluster, failing prometheus test")
 		}
@@ -149,7 +150,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 				// should have scraped some metrics from prometheus
 				`federate_samples{job="telemeter-client"} >= 10`: true,
 			}
-			runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
+			helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 
 			e2e.Logf("Telemetry is enabled: %s", bearerToken)
 		})
@@ -193,7 +194,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			})).NotTo(o.HaveOccurred(), fmt.Sprintf("Did not find tsdb_samples_appended_total, tsdb_head_samples_appended_total, or prometheus_tsdb_head_samples_appended_total"))
 
 			g.By("verifying the oauth-proxy reports a 403 on the root URL")
-			err := expectURLStatusCodeExec(ns, execPod.Name, url, 403)
+			err := helper.ExpectURLStatusCodeExec(ns, execPod.Name, url, 403)
 			o.Expect(err).NotTo(o.HaveOccurred())
 
 			g.By("verifying a service account token is able to authenticate")
@@ -304,7 +305,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 				// should have constantly firing a watchdog alert
 				`ALERTS{alertstate="firing",alertname="AlertmanagerReceiversNotConfigured"} == 1`: true,
 			}
-			runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
+			helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 
 			e2e.Logf("AlertmanagerReceiversNotConfigured alert is firing")
 		})
@@ -331,7 +332,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 				`sum(node_role_os_version_machine:cpu_capacity_cores:sum{label_kubernetes_io_arch!="",label_node_role_kubernetes_io_master!=""}) > 0`: true,
 				`sum(node_role_os_version_machine:cpu_capacity_sockets:sum{label_kubernetes_io_arch!="",label_node_hyperthread_enabled!="",label_node_role_kubernetes_io_master!=""}) > 0`: true,
 			}
-			runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
+			helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 		})
 		g.It("should have non-Pod host cAdvisor metrics", func() {
 			oc.SetupProject()
@@ -344,7 +345,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			tests := map[string]bool{
 				`container_cpu_usage_seconds_total{id!~"/kubepods.slice/.*"} >= 1`: true,
 			}
-			runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
+			helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 		})
 		g.It("shouldn't have failing rules evaluation", func() {
 			oc.SetupProject()
@@ -357,7 +358,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 			tests := map[string]bool{
 				`prometheus_rule_evaluation_failures_total >= 1`: false,
 			}
-			runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
+			helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 		})
 		networking.InOpenShiftSDNContext(func() {
 			g.It("should be able to get the sdn ovs flows", func() {
@@ -372,7 +373,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 					//something
 					`openshift_sdn_ovs_flows >= 1`: true,
 				}
-				runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
+				helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 			})
 		})
 		g.It("shouldn't report any alerts in firing state apart from Watchdog and AlertmanagerReceiversNotConfigured [Early]", func() {
@@ -390,7 +391,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 				// Checking Watchdog alert state is done in "should have a Watchdog alert in firing state".
 				`ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|PrometheusRemoteWriteDesiredShards",alertstate="firing",severity!="info"} >= 1`: false,
 			}
-			runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
+			helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
 		})
 		g.It("should provide ingress metrics", func() {
 			oc.SetupProject()
@@ -427,7 +428,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
 				`template_router_reload_seconds_count{job="router-internal-default"} >= 1`: true,
 				`haproxy_server_up{job="router-internal-default"} >= 1`: true,
 			}
-			runQueries(queries, oc, ns, execPod.Name, url, bearerToken)
+			helper.RunQueries(queries, oc, ns, execPod.Name, url, bearerToken)
 		})
 	})
 })
@@ -548,18 +549,6 @@ func findMetricLabels(f *dto.MetricFamily, labels map[string]string, match strin
 	return result
 }
 
-func expectURLStatusCodeExec(ns, execPodName, url string, statusCode int) error {
-	cmd := fmt.Sprintf("curl -k -s -o /dev/null -w '%%{http_code}' %q", url)
-	output, err := e2e.RunHostCmd(ns, execPodName, cmd)
-	if err != nil {
-		return fmt.Errorf("host command failed: %v\n%s", err, output)
-	}
-	if output != strconv.Itoa(statusCode) {
-		return fmt.Errorf("last response from server was not %d: %s", statusCode, output)
-	}
-	return nil
-}
-
 func expectBearerTokenURLStatusCodeExec(ns, execPodName, url, bearer string, statusCode int) error {
 	cmd := fmt.Sprintf("curl -k -s -H 'Authorization: Bearer %s' -o /dev/null -w '%%{http_code}' %q", bearer, url)
 	output, err := e2e.RunHostCmd(ns, execPodName, cmd)
diff --git a/test/extended/prometheus/prometheus_builds.go b/test/extended/prometheus/prometheus_builds.go
index 57a6e8355e46..1973ca428ae9 100644
--- a/test/extended/prometheus/prometheus_builds.go
+++ b/test/extended/prometheus/prometheus_builds.go
@@ -2,14 +2,10 @@ package prometheus
 
 import (
 	"context"
-	"encoding/json"
 	"fmt"
-	"net/url"
-	"time"
 
 	g "github.com/onsi/ginkgo"
 	o "github.com/onsi/gomega"
-	"github.com/prometheus/common/model"
 
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	e2e "k8s.io/kubernetes/test/e2e/framework"
@@ -18,11 +14,7 @@ import (
 	buildv1 "github.com/openshift/api/build/v1"
 	exutil "github.com/openshift/origin/test/extended/util"
 	"github.com/openshift/origin/test/extended/util/ibmcloud"
-)
-
-const (
-	maxPrometheusQueryAttempts = 5
-	prometheusQueryRetrySleep  = 10 * time.Second
+	helper "github.com/openshift/origin/test/extended/util/prometheus"
 )
 
 var _ = g.Describe("[sig-instrumentation][sig-builds][Feature:Builds] Prometheus", func() {
@@ -34,7 +26,7 @@ var _ = g.Describe("[sig-instrumentation][sig-builds][Feature:Builds] Prometheus
 	)
 	g.BeforeEach(func() {
 		var ok bool
-		url, bearerToken, ok = locatePrometheus(oc)
+		url, bearerToken, ok = helper.LocatePrometheus(oc)
 		if !ok {
 			e2eskipper.Skipf("Prometheus could not be located on this cluster, skipping prometheus test")
 		}
@@ -67,19 +59,11 @@ var _ = g.Describe("[sig-instrumentation][sig-builds][Feature:Builds] Prometheus
 			g.By("verifying the oauth-proxy reports a 403 on the root URL")
 			// allow for some retry, a la prometheus.go and its initial hitting of the metrics endpoint after
-			// instantiating prometheus tempalte
-			var err error
-			for i := 0; i < maxPrometheusQueryAttempts; i++ {
-				err = expectURLStatusCodeExec(ns, execPod.Name, url, 403)
-				if err == nil {
-					break
-				}
-				time.Sleep(prometheusQueryRetrySleep)
-			}
-			o.Expect(err).NotTo(o.HaveOccurred())
+			// instantiating prometheus template
+			helper.ExpectPrometheusEndpoint(ns, execPod.Name, url)
 
 			g.By("verifying a service account token is able to authenticate")
-			err = expectBearerTokenURLStatusCodeExec(ns, execPod.Name, fmt.Sprintf("%s/graph", url), bearerToken, 200)
+			err := expectBearerTokenURLStatusCodeExec(ns, execPod.Name, fmt.Sprintf("%s/graph", url), bearerToken, 200)
 			o.Expect(err).NotTo(o.HaveOccurred())
 
 			br := startOpenShiftBuild(oc, appTemplate)
@@ -95,7 +79,7 @@ var _ = g.Describe("[sig-instrumentation][sig-builds][Feature:Builds] Prometheus
 			terminalTests := map[string]bool{
 				buildCountMetricName: true,
 			}
-			runQueries(terminalTests, oc, ns, execPod.Name, url, bearerToken)
+			helper.RunQueries(terminalTests, oc, ns, execPod.Name, url, bearerToken)
 
 			// NOTE: in manual testing on a laptop, starting several serial builds in succession was sufficient for catching
 			// at least a few builds in new/pending state with the default prometheus query interval; but that has not
@@ -105,76 +89,6 @@ var _ = g.Describe("[sig-instrumentation][sig-builds][Feature:Builds] Prometheus
 	})
 })
 
-type prometheusResponse struct {
-	Status string                 `json:"status"`
-	Data   prometheusResponseData `json:"data"`
-}
-
-type prometheusResponseData struct {
-	ResultType string       `json:"resultType"`
-	Result     model.Vector `json:"result"`
-}
-
-func runQueries(promQueries map[string]bool, oc *exutil.CLI, ns, execPodName, baseURL, bearerToken string) {
-	// expect all correct metrics within a reasonable time period
-	queryErrors := make(map[string]error)
-	passed := make(map[string]struct{})
-	for i := 0; i < maxPrometheusQueryAttempts; i++ {
-		for query, expected := range promQueries {
-			if _, ok := passed[query]; ok {
-				continue
-			}
-			//TODO when the http/query apis discussed at https://github.com/prometheus/client_golang#client-for-the-prometheus-http-api
-			// and introduced at https://github.com/prometheus/client_golang/blob/master/api/prometheus/v1/api.go are vendored into
-			// openshift/origin, look to replace this homegrown http request / query param with that API
-			g.By("perform prometheus metric query " + query)
-			url := fmt.Sprintf("%s/api/v1/query?%s", baseURL, (url.Values{"query": []string{query}}).Encode())
-			contents, err := getBearerTokenURLViaPod(ns, execPodName, url, bearerToken)
-			o.Expect(err).NotTo(o.HaveOccurred())
-
-			// check query result, if this is a new error log it, otherwise remain silent
-			var result prometheusResponse
-			if err := json.Unmarshal([]byte(contents), &result); err != nil {
-				e2e.Logf("unable to parse query response for %s: %v", query, err)
-				continue
-			}
-			metrics := result.Data.Result
-			if result.Status != "success" {
-				data, _ := json.Marshal(metrics)
-				msg := fmt.Sprintf("promQL query: %s had reported incorrect status:\n%s", query, data)
-				if prev, ok := queryErrors[query]; !ok || prev.Error() != msg {
-					e2e.Logf("%s", msg)
-				}
-				queryErrors[query] = fmt.Errorf(msg)
-				continue
-			}
-			if (len(metrics) > 0 && !expected) || (len(metrics) == 0 && expected) {
-				data, _ := json.Marshal(metrics)
-				msg := fmt.Sprintf("promQL query: %s had reported incorrect results:\n%s", query, data)
-				if prev, ok := queryErrors[query]; !ok || prev.Error() != msg {
-					e2e.Logf("%s", msg)
-				}
-				queryErrors[query] = fmt.Errorf(msg)
-				continue
-			}
-
-			// query successful
-			passed[query] = struct{}{}
-			delete(queryErrors, query)
-		}
-
-		if len(queryErrors) == 0 {
-			break
-		}
-		time.Sleep(prometheusQueryRetrySleep)
-	}
-
-	if len(queryErrors) != 0 {
-		exutil.DumpPodLogsStartingWith("prometheus-0", oc)
-	}
-	o.Expect(queryErrors).To(o.BeEmpty())
-}
-
 func startOpenShiftBuild(oc *exutil.CLI, appTemplate string) *exutil.BuildResult {
 	g.By(fmt.Sprintf("calling oc create -f %s ", appTemplate))
 	err := oc.Run("create").Args("-f", appTemplate).Execute()
diff --git a/test/extended/util/client.go b/test/extended/util/client.go
index 89f07ca2ae42..1eff323c8f7f 100644
--- a/test/extended/util/client.go
+++ b/test/extended/util/client.go
@@ -91,6 +91,18 @@ type resourceRef struct {
 	Name string
 }
 
+// NewCLIWithFramework initializes the CLI using the provided Kube
+// framework. It can be called inside of a Ginkgo .It() function.
+func NewCLIWithFramework(kubeFramework *framework.Framework) *CLI {
+	cli := &CLI{
+		kubeFramework:   kubeFramework,
+		username:        "admin",
+		execPath:        "oc",
+		adminConfigPath: KubeConfigPath(),
+	}
+	return cli
+}
+
 // NewCLI initializes the CLI and Kube framework helpers with the provided
 // namespace. Should be called outside of a Ginkgo .It() function.
 func NewCLI(project string) *CLI {
diff --git a/test/extended/util/prometheus/helpers.go b/test/extended/util/prometheus/helpers.go
new file mode 100644
index 000000000000..498d76716df1
--- /dev/null
+++ b/test/extended/util/prometheus/helpers.go
@@ -0,0 +1,203 @@
+package prometheus
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/url"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+
+	g "github.com/onsi/ginkgo"
+	o "github.com/onsi/gomega"
+
+	exutil "github.com/openshift/origin/test/extended/util"
+	"github.com/prometheus/common/model"
+
+	v1 "k8s.io/api/core/v1"
+	kapierrs "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	clientset "k8s.io/client-go/kubernetes"
+	watchtools "k8s.io/client-go/tools/watch"
+	"k8s.io/kubernetes/pkg/client/conditions"
+	"k8s.io/kubernetes/test/e2e/framework"
+)
+
+const (
+	maxPrometheusQueryAttempts = 5
+	prometheusQueryRetrySleep  = 10 * time.Second
+)
+
+// PrometheusResponse is used to contain prometheus query results
+type PrometheusResponse struct {
+	Status string                 `json:"status"`
+	Data   prometheusResponseData `json:"data"`
+}
+
+type prometheusResponseData struct {
+	ResultType string       `json:"resultType"`
+	Result     model.Vector `json:"result"`
+}
+
+// TestUnsupportedAllowVersionSkew returns whether TEST_UNSUPPORTED_ALLOW_VERSION_SKEW is set
+func TestUnsupportedAllowVersionSkew() bool {
+	if len(os.Getenv("TEST_UNSUPPORTED_ALLOW_VERSION_SKEW")) > 0 {
+		return true
+	}
+	return false
+}
+
+// GetBearerTokenURLViaPod makes an http request through the given pod
+func GetBearerTokenURLViaPod(ns, execPodName, url, bearer string) (string, error) {
+	cmd := fmt.Sprintf("curl -s -k -H 'Authorization: Bearer %s' %q", bearer, url)
+	output, err := framework.RunHostCmd(ns, execPodName, cmd)
+	if err != nil {
+		return "", fmt.Errorf("host command failed: %v\n%s", err, output)
+	}
+	return output, nil
+}
+
+func waitForServiceAccountInNamespace(c clientset.Interface, ns, serviceAccountName string, timeout time.Duration) error {
+	w, err := c.CoreV1().ServiceAccounts(ns).Watch(context.Background(), metav1.SingleObject(metav1.ObjectMeta{Name: serviceAccountName}))
+	if err != nil {
+		return err
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+	_, err = watchtools.UntilWithoutRetry(ctx, w, conditions.ServiceAccountHasSecrets)
+	return err
+}
+
+// LocatePrometheus uses an existing CLI to return information used to make http requests to Prometheus
+func LocatePrometheus(oc *exutil.CLI) (url, bearerToken string, ok bool) {
+	_, err := oc.AdminKubeClient().CoreV1().Services("openshift-monitoring").Get(context.Background(), "prometheus-k8s", metav1.GetOptions{})
+	if kapierrs.IsNotFound(err) {
+		return "", "", false
+	}
+
+	waitForServiceAccountInNamespace(oc.AdminKubeClient(), "openshift-monitoring", "prometheus-k8s", 2*time.Minute)
+	for i := 0; i < 30; i++ {
+		secrets, err := oc.AdminKubeClient().CoreV1().Secrets("openshift-monitoring").List(context.Background(), metav1.ListOptions{})
+		o.Expect(err).NotTo(o.HaveOccurred())
+		for _, secret := range secrets.Items {
+			if secret.Type != v1.SecretTypeServiceAccountToken {
+				continue
+			}
+			if !strings.HasPrefix(secret.Name, "prometheus-") {
+				continue
+			}
+			bearerToken = string(secret.Data[v1.ServiceAccountTokenKey])
+			break
+		}
+		if len(bearerToken) == 0 {
+			framework.Logf("Waiting for prometheus service account secret to show up")
+			time.Sleep(time.Second)
+			continue
+		}
+	}
+	o.Expect(bearerToken).ToNot(o.BeEmpty())
+
+	return "https://prometheus-k8s.openshift-monitoring.svc:9091", bearerToken, true
+}
+
+// ExpectPrometheus uses an existing framework to return information used to make http requests to Prometheus
+func ExpectPrometheus(f *framework.Framework) (url, bearerToken string, oc *exutil.CLI, ok bool) {
+
+	// Must use a CLI constructor that can run within a Ginkgo It
+	oc = exutil.NewCLIWithFramework(f)
+
+	url, bearerToken, ok = LocatePrometheus(oc)
+
+	return url, bearerToken, oc, ok
+}
+
+// RunQueries executes Prometheus queries and checks each against its expected result.
+func RunQueries(promQueries map[string]bool, oc *exutil.CLI, ns, execPodName, baseURL, bearerToken string) {
+	// expect all correct metrics within a reasonable time period
+	queryErrors := make(map[string]error)
+	passed := make(map[string]struct{})
+	for i := 0; i < maxPrometheusQueryAttempts; i++ {
+		for query, expected := range promQueries {
+			if _, ok := passed[query]; ok {
+				continue
+			}
+			//TODO when the http/query apis discussed at https://github.com/prometheus/client_golang#client-for-the-prometheus-http-api
+			// and introduced at https://github.com/prometheus/client_golang/blob/master/api/prometheus/v1/api.go are vendored into
+			// openshift/origin, look to replace this homegrown http request / query param with that API
+			g.By("perform prometheus metric query " + query)
+			url := fmt.Sprintf("%s/api/v1/query?%s", baseURL, (url.Values{"query": []string{query}}).Encode())
+			contents, err := GetBearerTokenURLViaPod(ns, execPodName, url, bearerToken)
+			o.Expect(err).NotTo(o.HaveOccurred())
+
+			// check query result, if this is a new error log it, otherwise remain silent
+			var result PrometheusResponse
+			if err := json.Unmarshal([]byte(contents), &result); err != nil {
+				framework.Logf("unable to parse query response for %s: %v", query, err)
+				continue
+			}
+			metrics := result.Data.Result
+			if result.Status != "success" {
+				data, _ := json.Marshal(metrics)
+				msg := fmt.Sprintf("promQL query: %s had reported incorrect status:\n%s", query, data)
+				if prev, ok := queryErrors[query]; !ok || prev.Error() != msg {
+					framework.Logf("%s", msg)
+				}
+				queryErrors[query] = fmt.Errorf(msg)
+				continue
+			}
+			if (len(metrics) > 0 && !expected) || (len(metrics) == 0 && expected) {
+				data, _ := json.Marshal(metrics)
+				msg := fmt.Sprintf("promQL query: %s had reported incorrect results:\n%s", query, data)
+				if prev, ok := queryErrors[query]; !ok || prev.Error() != msg {
+					framework.Logf("%s", msg)
+				}
+				queryErrors[query] = fmt.Errorf(msg)
+				continue
+			}
+
+			// query successful
+			passed[query] = struct{}{}
+			delete(queryErrors, query)
+		}
+
+		if len(queryErrors) == 0 {
+			break
+		}
+		time.Sleep(prometheusQueryRetrySleep)
+	}
+
+	if len(queryErrors) != 0 {
+		exutil.DumpPodLogsStartingWith("prometheus-0", oc)
+	}
+	o.Expect(queryErrors).To(o.BeEmpty())
+}
+
+// ExpectURLStatusCodeExec attempts a connection to url, returning an error
+// upon failure or if the returned status code is not equal to statusCode.
+func ExpectURLStatusCodeExec(ns, execPodName, url string, statusCode int) error {
+	cmd := fmt.Sprintf("curl -k -s -o /dev/null -w '%%{http_code}' %q", url)
+	output, err := framework.RunHostCmd(ns, execPodName, cmd)
+	if err != nil {
+		return fmt.Errorf("host command failed: %v\n%s", err, output)
+	}
+	if output != strconv.Itoa(statusCode) {
+		return fmt.Errorf("last response from server was not %d: %s", statusCode, output)
+	}
+	return nil
+}
+
+// ExpectPrometheusEndpoint attempts to connect to the metrics endpoint with
+// delayed retries upon failure.
+func ExpectPrometheusEndpoint(namespace, podName, url string) {
+	var err error
+	for i := 0; i < maxPrometheusQueryAttempts; i++ {
+		err = ExpectURLStatusCodeExec(namespace, podName, url, 403)
+		if err == nil {
+			break
+		}
+		time.Sleep(prometheusQueryRetrySleep)
+	}
+	o.Expect(err).NotTo(o.HaveOccurred())
+}