103 changes: 103 additions & 0 deletions test/e2e/upgrade/alert/alert.go
@@ -0,0 +1,103 @@
package alert

import (
"context"
"fmt"
"time"

g "github.com/onsi/ginkgo"

exutil "github.com/openshift/origin/test/extended/util"
helper "github.com/openshift/origin/test/extended/util/prometheus"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/kubernetes/test/e2e/framework"
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
"k8s.io/kubernetes/test/e2e/upgrades"
)

const (
// Delay after upgrade is complete before checking for critical alerts
alertCheckSleepMinutes = 5
alertCheckSleep = alertCheckSleepMinutes * time.Minute

// Look-back period, in minutes, in which to check for critical alerts
alertPeriodCheckMinutes = 1
)

// UpgradeTest runs post-upgrade, after an alertCheckSleep delay, and checks whether any critical alerts are firing.
type UpgradeTest struct {
url string
bearerToken string
oc *exutil.CLI
}

func (UpgradeTest) Name() string { return "check-for-critical-alerts" }
func (UpgradeTest) DisplayName() string {
return "Check if critical alerts are firing after upgrade success"
}

// Setup creates parameters to query Prometheus
func (t *UpgradeTest) Setup(f *framework.Framework) {
g.By("Setting up post-upgrade alert test")

url, bearerToken, oc, ok := helper.ExpectPrometheus(f)
if !ok {
framework.Failf("Prometheus could not be located on this cluster, failing test %s", t.Name())
}
t.url = url
t.bearerToken = bearerToken
t.oc = oc
framework.Logf("Post-upgrade alert test setup complete")
}

// Test checks if any critical alerts are firing.
func (t *UpgradeTest) Test(f *framework.Framework, done <-chan struct{}, upgrade upgrades.UpgradeType) {
g.By("Checking for critical alerts")

// Recover the current test if it fails so the test suite can complete
defer g.GinkgoRecover()

// Block until upgrade is done
g.By("Waiting for upgrade to finish before checking for critical alerts")
<-done

ctx, cancel := context.WithCancel(context.Background())
// Cancel only when the test returns so the deferred pod deletion below still has a live context
defer cancel()

// Additional delay after upgrade completion
g.By("Waiting before checking for critical alerts")
time.Sleep(alertCheckSleep)

if helper.TestUnsupportedAllowVersionSkew() {
e2eskipper.Skipf("Test is disabled to allow cluster components to have different versions; skewed versions trigger multiple other alerts")
}
t.oc.SetupProject()
ns := t.oc.Namespace()
execPod := exutil.CreateCentosExecPodOrFail(t.oc.AdminKubeClient(), ns, "execpod", nil)
defer func() {
t.oc.AdminKubeClient().CoreV1().Pods(ns).Delete(ctx, execPod.Name, *metav1.NewDeleteOptions(1))
}()

// Query to check that Prometheus has been up and running for the entire post-upgrade
// period by verifying the Watchdog alert has been in the firing state
watchdogQuery := fmt.Sprintf(`count_over_time(ALERTS{alertstate="firing",alertname="Watchdog", severity="none"}[%dm])`, alertCheckSleepMinutes)

// Query to check for any critical-severity alerts that have fired within the last alertPeriodCheckMinutes minutes.
// TODO Remove KubeAPIErrorBudgetBurn from ignore list once Bug 1821661 is fixed.
criticalAlertQuery := fmt.Sprintf(`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh|KubeAPIErrorBudgetBurn",alertstate="firing",severity="critical"}[%dm]) >= 1`, alertPeriodCheckMinutes)

tests := map[string]bool{
watchdogQuery: true,
criticalAlertQuery: false,
}

helper.RunQueries(tests, t.oc, ns, execPod.Name, t.url, t.bearerToken)

framework.Logf("No crtical alerts firing post-upgrade")
}

// Teardown cleans up any remaining resources.
func (t *UpgradeTest) Teardown(f *framework.Framework) {
// rely on the namespace deletion to clean up everything
}
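For reference, with the constants above substituted (alertCheckSleepMinutes = 5, alertPeriodCheckMinutes = 1), the two PromQL expressions the test submits render as shown below; the boolean in the tests map appears to mark whether the query is expected to return samples (true for the Watchdog check, false for the critical-alert check). The constant names are illustrative only; the test builds these strings with fmt.Sprintf at runtime.

// Rendered forms of the two queries above (sketch; names are illustrative).
const renderedWatchdogQuery = `count_over_time(ALERTS{alertstate="firing",alertname="Watchdog", severity="none"}[5m])`
const renderedCriticalAlertQuery = `count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh|KubeAPIErrorBudgetBurn",alertstate="firing",severity="critical"}[1m]) >= 1`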
2 changes: 2 additions & 0 deletions test/e2e/upgrade/upgrade.go
@@ -26,6 +26,7 @@ import (

configv1 "github.com/openshift/api/config/v1"
configv1client "github.com/openshift/client-go/config/clientset/versioned"
"github.com/openshift/origin/test/e2e/upgrade/alert"
"github.com/openshift/origin/test/e2e/upgrade/service"
"github.com/openshift/origin/test/extended/util/disruption"
"github.com/openshift/origin/test/extended/util/disruption/controlplane"
@@ -36,6 +37,7 @@ func AllTests() []upgrades.Test {
return []upgrades.Test{
&controlplane.KubeAvailableTest{},
&controlplane.OpenShiftAvailableTest{},
&alert.UpgradeTest{},
&frontends.AvailableTest{},
&service.UpgradeTest{},
&upgrades.SecretUpgradeTest{},
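The new &alert.UpgradeTest{} entry plugs into the same upgrades.Test contract the other suite members satisfy. A paraphrased sketch of that contract, inferred from the methods implemented in alert.go above (the real interface lives in k8s.io/kubernetes/test/e2e/upgrades and its exact signatures may differ; DisplayName looks like an origin-side addition rather than part of the upstream interface):

// Sketch only: the upgrade-test contract as inferred from alert.go.
package upgrades

import "k8s.io/kubernetes/test/e2e/framework"

// UpgradeType identifies the kind of upgrade being exercised (assumed shape).
type UpgradeType int

type Test interface {
	Name() string
	Setup(f *framework.Framework)
	Test(f *framework.Framework, done <-chan struct{}, upgrade UpgradeType)
	Teardown(f *framework.Framework)
}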
41 changes: 15 additions & 26 deletions test/extended/prometheus/prometheus.go
@@ -33,6 +33,7 @@ import (
"github.com/openshift/origin/test/extended/networking"
exutil "github.com/openshift/origin/test/extended/util"
"github.com/openshift/origin/test/extended/util/ibmcloud"
helper "github.com/openshift/origin/test/extended/util/prometheus"
)

var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
@@ -44,7 +45,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
)
g.BeforeEach(func() {
var ok bool
url, bearerToken, ok = locatePrometheus(oc)
url, bearerToken, ok = helper.LocatePrometheus(oc)
if !ok {
e2e.Failf("Prometheus could not be located on this cluster, failing prometheus test")
}
@@ -65,7 +66,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
// Checking Watchdog alert state is done in "should have a Watchdog alert in firing state".
`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh",alertstate="firing",severity!="info"}[2h]) >= 1`: false,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
})

g.It("should have a Watchdog alert in firing state the entire cluster run", func() {
@@ -80,7 +81,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
// should have constantly firing a watchdog alert
`count_over_time(ALERTS{alertstate="firing",alertname="Watchdog", severity="none"}[1h])`: true,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)

e2e.Logf("Watchdog alert is firing")
})
@@ -102,7 +103,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
// rule contains the count of the all the series that are sent via telemetry.
`max_over_time(cluster:telemetry_selected_series:count[2h]) >= 500`: false,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)

e2e.Logf("Total number of series sent via telemetry is below the limit")
})
@@ -119,7 +120,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {

g.BeforeEach(func() {
var ok bool
url, bearerToken, ok = locatePrometheus(oc)
url, bearerToken, ok = helper.LocatePrometheus(oc)
if !ok {
e2e.Failf("Prometheus could not be located on this cluster, failing prometheus test")
}
@@ -149,7 +150,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
// should have scraped some metrics from prometheus
`federate_samples{job="telemeter-client"} >= 10`: true,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)

e2e.Logf("Telemetry is enabled: %s", bearerToken)
})
@@ -193,7 +194,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
})).NotTo(o.HaveOccurred(), fmt.Sprintf("Did not find tsdb_samples_appended_total, tsdb_head_samples_appended_total, or prometheus_tsdb_head_samples_appended_total"))

g.By("verifying the oauth-proxy reports a 403 on the root URL")
err := expectURLStatusCodeExec(ns, execPod.Name, url, 403)
err := helper.ExpectURLStatusCodeExec(ns, execPod.Name, url, 403)
o.Expect(err).NotTo(o.HaveOccurred())

g.By("verifying a service account token is able to authenticate")
@@ -304,7 +305,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
// should have constantly firing a watchdog alert
`ALERTS{alertstate="firing",alertname="AlertmanagerReceiversNotConfigured"} == 1`: true,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)

e2e.Logf("AlertmanagerReceiversNotConfigured alert is firing")
})
@@ -331,7 +332,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
`sum(node_role_os_version_machine:cpu_capacity_cores:sum{label_kubernetes_io_arch!="",label_node_role_kubernetes_io_master!=""}) > 0`: true,
`sum(node_role_os_version_machine:cpu_capacity_sockets:sum{label_kubernetes_io_arch!="",label_node_hyperthread_enabled!="",label_node_role_kubernetes_io_master!=""}) > 0`: true,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
})
g.It("should have non-Pod host cAdvisor metrics", func() {
oc.SetupProject()
@@ -344,7 +345,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
tests := map[string]bool{
`container_cpu_usage_seconds_total{id!~"/kubepods.slice/.*"} >= 1`: true,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
})
g.It("shouldn't have failing rules evaluation", func() {
oc.SetupProject()
@@ -357,7 +358,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
tests := map[string]bool{
`prometheus_rule_evaluation_failures_total >= 1`: false,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
})
networking.InOpenShiftSDNContext(func() {
g.It("should be able to get the sdn ovs flows", func() {
@@ -372,7 +373,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
//something
`openshift_sdn_ovs_flows >= 1`: true,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
})
})
g.It("shouldn't report any alerts in firing state apart from Watchdog and AlertmanagerReceiversNotConfigured [Early]", func() {
@@ -390,7 +391,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
// Checking Watchdog alert state is done in "should have a Watchdog alert in firing state".
`ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|PrometheusRemoteWriteDesiredShards",alertstate="firing",severity!="info"} >= 1`: false,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
})
g.It("should provide ingress metrics", func() {
oc.SetupProject()
@@ -427,7 +428,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
`template_router_reload_seconds_count{job="router-internal-default"} >= 1`: true,
`haproxy_server_up{job="router-internal-default"} >= 1`: true,
}
runQueries(queries, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(queries, oc, ns, execPod.Name, url, bearerToken)
})
})
})
@@ -548,18 +549,6 @@ func findMetricLabels(f *dto.MetricFamily, labels map[string]string, match strin
return result
}

func expectURLStatusCodeExec(ns, execPodName, url string, statusCode int) error {
cmd := fmt.Sprintf("curl -k -s -o /dev/null -w '%%{http_code}' %q", url)
output, err := e2e.RunHostCmd(ns, execPodName, cmd)
if err != nil {
return fmt.Errorf("host command failed: %v\n%s", err, output)
}
if output != strconv.Itoa(statusCode) {
return fmt.Errorf("last response from server was not %d: %s", statusCode, output)
}
return nil
}

func expectBearerTokenURLStatusCodeExec(ns, execPodName, url, bearer string, statusCode int) error {
cmd := fmt.Sprintf("curl -k -s -H 'Authorization: Bearer %s' -o /dev/null -w '%%{http_code}' %q", bearer, url)
output, err := e2e.RunHostCmd(ns, execPodName, cmd)
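The call sites above now go through the new test/extended/util/prometheus package (helper.LocatePrometheus, helper.RunQueries, helper.ExpectURLStatusCodeExec) instead of file-local helpers. As one example, the exported status-code helper presumably carries the same body as the removed expectURLStatusCodeExec, just renamed and relocated; a sketch assuming only that move (it needs the fmt, strconv, and e2e framework imports the original already used):

// ExpectURLStatusCodeExec: assumed exported relocation of the helper removed above.
func ExpectURLStatusCodeExec(ns, execPodName, url string, statusCode int) error {
	// curl the URL from the exec pod and compare the returned HTTP status code
	cmd := fmt.Sprintf("curl -k -s -o /dev/null -w '%%{http_code}' %q", url)
	output, err := e2e.RunHostCmd(ns, execPodName, cmd)
	if err != nil {
		return fmt.Errorf("host command failed: %v\n%s", err, output)
	}
	if output != strconv.Itoa(statusCode) {
		return fmt.Errorf("last response from server was not %d: %s", statusCode, output)
	}
	return nil
}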