60 changes: 57 additions & 3 deletions test/extended/prometheus/prometheus.go
@@ -16,6 +16,7 @@ import (
dto "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"

configv1 "github.com/openshift/api/config/v1"
v1 "k8s.io/api/core/v1"

kapierrs "k8s.io/apimachinery/pkg/api/errors"
@@ -58,6 +59,10 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
if len(os.Getenv("TEST_UNSUPPORTED_ALLOW_VERSION_SKEW")) > 0 {
e2eskipper.Skipf("Test is disabled to allow cluster components to have different versions, and skewed versions trigger multiple other alerts")
}

infra, err := oc.AdminConfigClient().ConfigV1().Infrastructures().Get(context.Background(), "cluster", metav1.GetOptions{})
o.Expect(err).NotTo(o.HaveOccurred())

ns := oc.SetupNamespace()
execPod := exutil.CreateExecPodOrFail(oc.AdminKubeClient(), ns, "execpod")
defer func() {
@@ -71,6 +76,38 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
},
}

if infra.Status.ControlPlaneTopology == configv1.SingleReplicaTopologyMode {
firingAlertsWithBugs = append(firingAlertsWithBugs, helper.MetricConditions{
{
Selector: map[string]string{"alertname": "KubeMemoryOvercommit"},
Text: "https://issues.redhat.com/browse/MON-1522",
},
{
Selector: map[string]string{"alertname": "KubeCPUOvercommit"},
Text: "https://issues.redhat.com/browse/MON-1522",
},
{
Selector: map[string]string{"alertname": "SystemMemoryExceedsReservation"},
Text: "https://bugzilla.redhat.com/show_bug.cgi?id=1945017",
},
//TODO: API-1136 is not yet implemented - API server rollouts cause API downtime in single-replica topology.
// As a result, some pods crash, which causes some aggregated APIs to become unavailable.
// Until this is resolved, we ignore those AggregatedAPIDown alerts.
{
Selector: map[string]string{"alertname": "AggregatedAPIDown", "name": "v1.oauth.openshift.io"},
Text: "https://issues.redhat.com/browse/API-1136",
},
{
Selector: map[string]string{"alertname": "AggregatedAPIDown", "name": "v1.packages.operators.coreos.com"},
Text: "https://issues.redhat.com/browse/API-1136",
},
{
Selector: map[string]string{"alertname": "AggregatedAPIDown", "name": "v1.user.openshift.io"},
Text: "https://issues.redhat.com/browse/API-1136",
},
}...)
}

pendingAlertsWithBugs := helper.MetricConditions{}
allowedPendingAlerts := helper.MetricConditions{}
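For readers unfamiliar with the exemption mechanism: each entry above pairs a label selector with a link to the tracking bug, and a firing alert is excused only when every key/value in the selector matches the alert's labels. The `helper.MetricConditions` implementation is not part of this diff, so the following is a minimal hypothetical sketch of that matching logic; the `MatchesLabels` method and field layout are assumptions that mirror the usage above, not the actual helper code.

```go
package helper

// MetricCondition pairs a label selector with a pointer to the bug that
// explains why a matching alert is tolerated (hypothetical sketch; the
// field names mirror the literals used in the test above).
type MetricCondition struct {
	Selector map[string]string
	Text     string
}

type MetricConditions []MetricCondition

// MatchesLabels returns the first condition whose selector is a subset of
// the alert's label set, or nil when the alert is not excused. (Assumed
// behavior, inferred from usage; not taken from this diff.)
func (c MetricConditions) MatchesLabels(labels map[string]string) *MetricCondition {
	for i := range c {
		matched := true
		for k, v := range c[i].Selector {
			if labels[k] != v {
				matched = false
				break
			}
		}
		if matched {
			return &c[i]
		}
	}
	return nil
}
```

With this shape, the `AggregatedAPIDown` entries above only excuse the three named aggregated APIs; an `AggregatedAPIDown` alert carrying any other `name` label would still fail the test.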

@@ -446,11 +483,28 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
oc.AdminKubeClient().CoreV1().Pods(ns).Delete(context.Background(), execPod.Name, *metav1.NewDeleteOptions(1))
}()

infra, err := oc.AdminConfigClient().ConfigV1().Infrastructures().Get(context.Background(), "cluster", metav1.GetOptions{})
o.Expect(err).NotTo(o.HaveOccurred())

// Checking Watchdog alert state is done in "should have a Watchdog alert in firing state".
allowedAlerts := []string{"Watchdog", "AlertmanagerReceiversNotConfigured", "PrometheusRemoteWriteDesiredShards"}
if infra.Status.ControlPlaneTopology == configv1.SingleReplicaTopologyMode {
//TODO: MON-1522 - these alerts wrongly fire in single-replica topology; remove this if statement once that bug is fixed.
allowedAlerts = append(allowedAlerts, "KubeMemoryOvercommit", "KubeCPUOvercommit")

//TODO: API-1136 is not yet implemented - API server rollouts cause API downtime in single-replica topology.
// As a result, some pods crash, which causes some aggregated APIs to become unavailable.
// Until this is resolved, we ignore AggregatedAPIDown alerts.
allowedAlerts = append(allowedAlerts, "AggregatedAPIDown")

//TODO: https://bugzilla.redhat.com/show_bug.cgi?id=1945017
allowedAlerts = append(allowedAlerts, "SystemMemoryExceedsReservation")
}

tests := map[string]bool{
// Checking Watchdog alert state is done in "should have a Watchdog alert in firing state".
`ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|PrometheusRemoteWriteDesiredShards",alertstate="firing",severity!="info"} >= 1`: false,
fmt.Sprintf(`ALERTS{alertname!~"%s",alertstate="firing",severity!="info"} >= 1`, strings.Join(allowedAlerts, "|")): false,
}
err := helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
err = helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
o.Expect(err).NotTo(o.HaveOccurred())
})
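To make the query construction above concrete, here is the PromQL that `fmt.Sprintf` and `strings.Join` produce on a single-replica cluster, which the test evidently expects to return no series (the `false` value in the map). The snippet is derived directly from the code in this diff and simply prints the rendered query:

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Base allowlist plus the single-replica additions from the test above.
	allowedAlerts := []string{
		"Watchdog", "AlertmanagerReceiversNotConfigured", "PrometheusRemoteWriteDesiredShards",
		"KubeMemoryOvercommit", "KubeCPUOvercommit",
		"AggregatedAPIDown",
		"SystemMemoryExceedsReservation",
	}
	// alertname!~ is a negated regex match, so the joined names form an
	// alternation: any firing, non-info alert whose name is NOT in the
	// allowlist makes the query return a result and fails the test.
	query := fmt.Sprintf(`ALERTS{alertname!~"%s",alertstate="firing",severity!="info"} >= 1`, strings.Join(allowedAlerts, "|"))
	fmt.Println(query)
	// Output:
	// ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|PrometheusRemoteWriteDesiredShards|KubeMemoryOvercommit|KubeCPUOvercommit|AggregatedAPIDown|SystemMemoryExceedsReservation",alertstate="firing",severity!="info"} >= 1
}
```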
