diff --git a/test/extended/prometheus/prometheus.go b/test/extended/prometheus/prometheus.go index 7c9456fbedb4..170d7bb81007 100644 --- a/test/extended/prometheus/prometheus.go +++ b/test/extended/prometheus/prometheus.go @@ -13,6 +13,7 @@ import ( g "github.com/onsi/ginkgo" o "github.com/onsi/gomega" + promv1 "github.com/prometheus/client_golang/api/prometheus/v1" dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" "github.com/prometheus/common/model" @@ -42,6 +43,171 @@ import ( helper "github.com/openshift/origin/test/extended/util/prometheus" ) +var _ = g.Describe("[sig-instrumentation][Late] OpenShift alerting rules", func() { + defer g.GinkgoRecover() + + // These alerts are known to be missing the summary and/or description + // labels. Bugzillas are being filed, and will be linked here. These + // should be fixed one-by-one and removed from this list. + descriptionExceptions := sets.NewString( + "APIRemovedInNextEUSReleaseInUse", + "APIRemovedInNextReleaseInUse", + "CertifiedOperatorsCatalogError", + "CloudCredentialOperatorDeprovisioningFailed", + "CloudCredentialOperatorInsufficientCloudCreds", + "CloudCredentialOperatorProvisioningFailed", + "CloudCredentialOperatorTargetNamespaceMissing", + "ClusterProxyApplySlow", + "CommunityOperatorsCatalogError", + "CsvAbnormalFailedOver2Min", + "CsvAbnormalOver30Min", + "ExtremelyHighIndividualControlPlaneCPU", + "HAProxyDown", + "HAProxyReloadFail", + "HighOverallControlPlaneCPU", + "ImageRegistryStorageReconfigured", + "IngressControllerDegraded", + "IngressControllerUnavailable", + "InstallPlanStepAppliedWithWarnings", + "KubeControllerManagerDown", + "KubeSchedulerDown", + "KubeletHealthState", + "MCDDrainError", + "MCDPivotError", + "MCDRebootError", + "MachineAPIOperatorMetricsCollectionFailing", + "MachineApproverMaxPendingCSRsReached", + "MachineHealthCheckUnterminatedShortCircuit", + "MachineNotYetDeleted", + "MachineWithNoRunningPhase", + "MachineWithoutValidNode", + 
"MasterNodesHighMemoryUsage", + "NodeProxyApplySlow", + "NodeProxyApplyStale", + "NodeWithoutSDNPod", + "PodDisruptionBudgetAtLimit", + "PodDisruptionBudgetLimit", + "RedhatMarketplaceCatalogError", + "RedhatOperatorsCatalogError", + "SDNPodNotReady", + "SamplesDegraded", + "SamplesImagestreamImportFailing", + "SamplesInvalidConfig", + "SamplesMissingSecret", + "SamplesMissingTBRCredential", + "SamplesRetriesMissingOnImagestreamImportFailing", + "SamplesTBRInaccessibleOnBoot", + "SchedulerLegacyPolicySet", + "SystemMemoryExceedsReservation", + "TechPreviewNoUpgrade", + "etcdBackendQuotaLowSpace", + "etcdExcessiveDatabaseGrowth", + "etcdHighFsyncDurations", + ) + + var alertingRules map[string][]promv1.AlertingRule + oc := exutil.NewCLIWithoutNamespace("prometheus") + + g.BeforeEach(func() { + url, _, bearerToken, ok := helper.LocatePrometheus(oc) + if !ok { + e2e.Failf("Prometheus could not be located on this cluster, failing prometheus test") + } + + if alertingRules == nil { + var err error + + alertingRules, err = helper.FetchAlertingRules(oc, url, bearerToken) + if err != nil { + e2e.Failf("Failed to fetch alerting rules: %v", err) + } + } + }) + + g.It("should have a valid severity label", func() { + helper.ForEachAlertingRule(alertingRules, func(alert promv1.AlertingRule) sets.String { + // Parenthesize the alternation: without the group, "^" binds only to + // "critical" and "$" only to "info", so e.g. "notwarning" would match. + severityRe := regexp.MustCompile("^(critical|warning|info)$") + + severity, found := alert.Labels["severity"] + if !found { + return sets.NewString("has no 'severity' label") + } + + if !severityRe.MatchString(string(severity)) { + return sets.NewString( + fmt.Sprintf("has a 'severity' label value of %q which doesn't match %q", + severity, severityRe.String(), + ), + ) + } + + return nil + }) + }) + + g.It("should have description and summary annotations", func() { + helper.ForEachAlertingRule(alertingRules, func(alert promv1.AlertingRule) sets.String { + if descriptionExceptions.Has(alert.Name) { + // Pass alert.Name for the %q verb (was missing, a go vet printf error). + framework.Logf("Alerting rule %q is known to have missing annotations.", alert.Name) + 
return nil + } + + violations := sets.NewString() + + if _, found := alert.Annotations["description"]; !found { + // If there's no 'description' annotation, but there is a + // 'message' annotation, suggest renaming it. + if _, found := alert.Annotations["message"]; found { + violations.Insert("has no 'description' annotation, but has a 'message' annotation." + + " OpenShift alerts must use 'description' -- consider renaming the annotation") + } else { + violations.Insert("has no 'description' annotation") + } + } + + if _, found := alert.Annotations["summary"]; !found { + violations.Insert("has no 'summary' annotation") + } + + return violations + }) + }) + + g.It("should have a runbook_url annotation if the alert is critical", func() { + violations := sets.NewString() + + helper.ForEachAlertingRule(alertingRules, func(alert promv1.AlertingRule) sets.String { + severity := string(alert.Labels["severity"]) + runbook := string(alert.Annotations["runbook_url"]) + + if severity == "critical" && runbook == "" { + violations.Insert( + fmt.Sprintf("WARNING: Alert %q is critical and has no 'runbook_url' annotation", alert.Name), + ) + } else if runbook != "" { + // If there's a 'runbook_url' annotation, make sure it's a + // valid URL and that we can fetch the contents. + if err := helper.ValidateURL(runbook, 10*time.Second); err != nil { + violations.Insert( + fmt.Sprintf("WARNING: Alert %q has an invalid 'runbook_url' annotation: %v", + alert.Name, err), + ) + } + } + + return nil // Always return nil, because these aren't required yet. + }) + + // These aren't enforced yet. They are just reporting the errors by + // calling the g.Skip() method. 
+ if len(violations) > 0 { + g.Skip( + fmt.Sprintf("Incompliant rules detected:\n\n%s", strings.Join(violations.List(), "\n")), + ) + } + }) +}) + var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() { defer g.GinkgoRecover() var ( diff --git a/test/extended/util/annotate/generated/zz_generated.annotations.go b/test/extended/util/annotate/generated/zz_generated.annotations.go index 17b5cca87d06..67a933a834d9 100644 --- a/test/extended/util/annotate/generated/zz_generated.annotations.go +++ b/test/extended/util/annotate/generated/zz_generated.annotations.go @@ -1703,6 +1703,12 @@ var annotations = map[string]string{ "[Top Level] [sig-instrumentation][Late] Alerts shouldn't report any alerts in firing or pending state apart from Watchdog and AlertmanagerReceiversNotConfigured and have no gaps in Watchdog firing": "shouldn't report any alerts in firing or pending state apart from Watchdog and AlertmanagerReceiversNotConfigured and have no gaps in Watchdog firing [Skipped:Disconnected] [Suite:openshift/conformance/parallel]", + "[Top Level] [sig-instrumentation][Late] OpenShift alerting rules should have a runbook_url annotation if the alert is critical": "should have a runbook_url annotation if the alert is critical [Skipped:Disconnected] [Suite:openshift/conformance/parallel]", + + "[Top Level] [sig-instrumentation][Late] OpenShift alerting rules should have a valid severity label": "should have a valid severity label [Skipped:Disconnected] [Suite:openshift/conformance/parallel]", + + "[Top Level] [sig-instrumentation][Late] OpenShift alerting rules should have description and summary annotations": "should have description and summary annotations [Skipped:Disconnected] [Suite:openshift/conformance/parallel]", + "[Top Level] [sig-instrumentation][sig-builds][Feature:Builds] Prometheus when installed on the cluster should start and expose a secured proxy and verify build metrics": "should start and expose a secured proxy and verify build metrics 
[Skipped:Disconnected] [Suite:openshift/conformance/parallel]", "[Top Level] [sig-network-edge] DNS should answer A and AAAA queries for a dual-stack service": "should answer A and AAAA queries for a dual-stack service [Suite:openshift/conformance/parallel]", diff --git a/test/extended/util/annotate/rules.go b/test/extended/util/annotate/rules.go index 366abfa730a8..bae4b30e0b28 100644 --- a/test/extended/util/annotate/rules.go +++ b/test/extended/util/annotate/rules.go @@ -208,6 +208,9 @@ var ( `\[sig-instrumentation\] Prometheus when installed on the cluster shouldn't have failing rules evaluation`, `\[sig-instrumentation\] Prometheus when installed on the cluster shouldn't report any alerts in firing state apart from Watchdog and AlertmanagerReceiversNotConfigured \[Early\]`, `\[sig-instrumentation\] Prometheus when installed on the cluster when using openshift-sdn should be able to get the sdn ovs flows`, + `\[sig-instrumentation\]\[Late\] OpenShift alerting rules should have a valid severity label`, + `\[sig-instrumentation\]\[Late\] OpenShift alerting rules should have description and summary annotations`, + `\[sig-instrumentation\]\[Late\] OpenShift alerting rules should have a runbook_url annotation if the alert is critical`, `\[sig-instrumentation\]\[Late\] Alerts should have a Watchdog alert in firing state the entire cluster run`, `\[sig-instrumentation\]\[Late\] Alerts shouldn't exceed the 500 series limit of total series sent via telemetry from each cluster`, `\[sig-instrumentation\]\[Late\] Alerts shouldn't report any alerts in firing or pending state apart from Watchdog and AlertmanagerReceiversNotConfigured and have no gaps in Watchdog firing`, diff --git a/test/extended/util/prometheus/helpers.go b/test/extended/util/prometheus/helpers.go index 2ae1fd65a0a2..3086018f7034 100644 --- a/test/extended/util/prometheus/helpers.go +++ b/test/extended/util/prometheus/helpers.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "fmt" + "net/http" "net/url" 
"strconv" "strings" @@ -12,8 +13,11 @@ import ( g "github.com/onsi/ginkgo" o "github.com/onsi/gomega" "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apimachinery/pkg/util/wait" exutil "github.com/openshift/origin/test/extended/util" + promv1 "github.com/prometheus/client_golang/api/prometheus/v1" "github.com/prometheus/common/model" v1 "k8s.io/api/core/v1" @@ -257,3 +261,104 @@ func ExpectPrometheusEndpoint(namespace, podName, url string) { } o.Expect(err).NotTo(o.HaveOccurred()) } + +// FetchAlertingRules fetches all alerting rules from the Prometheus at +// the given URL using the given bearer token. The results are returned +// as a map of group names to lists of alerting rules. +func FetchAlertingRules(oc *exutil.CLI, promURL, bearerToken string) (map[string][]promv1.AlertingRule, error) { + ns := oc.SetupNamespace() + execPod := exutil.CreateExecPodOrFail(oc.AdminKubeClient(), ns, "execpod") + defer func() { + oc.AdminKubeClient().CoreV1().Pods(ns).Delete(context.Background(), execPod.Name, *metav1.NewDeleteOptions(1)) + }() + + url := fmt.Sprintf("%s/api/v1/rules", promURL) + contents, err := GetBearerTokenURLViaPod(ns, execPod.Name, url, bearerToken) + if err != nil { + return nil, fmt.Errorf("unable to query %s: %v", url, err) + } + + var result struct { + Status string `json:"status"` + Data promv1.RulesResult `json:"data"` + } + if err := json.Unmarshal([]byte(contents), &result); err != nil { + return nil, fmt.Errorf("unable to parse response %q from %s: %v", contents, url, err) + } + + alertingRules := make(map[string][]promv1.AlertingRule) + + for _, rg := range result.Data.Groups { + for _, r := range rg.Rules { + switch v := r.(type) { + case promv1.RecordingRule: + continue + case promv1.AlertingRule: + alertingRules[rg.Name] = append(alertingRules[rg.Name], v) + default: + return nil, fmt.Errorf("unexpected rule of type %T", r) + } + } + } + + return alertingRules, nil +} + +// ValidateURL takes a URL as a 
string and a timeout. The URL is + parsed, then fetched until a 200 response is received or the timeout + is reached. + func ValidateURL(rawURL string, timeout time.Duration) error { + if _, err := url.Parse(rawURL); err != nil { + return err + } + + return wait.PollImmediate(1*time.Second, timeout, func() (done bool, err error) { + resp, err := http.Get(rawURL) + if err != nil { + // Pass rawURL for the %q verb (was missing, a go vet printf error + // that would have logged the error value under %q and nothing for %v). + framework.Logf("validateURL(%q) error: %v", rawURL, err) + return false, nil + } + + defer resp.Body.Close() + + if resp.StatusCode == 200 { + return true, nil + } + + framework.Logf("validateURL(%q) got non-200 response: %d", rawURL, resp.StatusCode) + return false, nil + }) +} + +// ForEachAlertingRule takes a map of rule group names to a list of +// alerting rules, and for each rule in each group runs the given +// function. The function takes the alerting rule, and returns a set of +// violations, which may be empty or nil. If after all rules are +// checked, there are any violations, the calling test is failed, and +// all violations are reported. +func ForEachAlertingRule(rules map[string][]promv1.AlertingRule, f func(a promv1.AlertingRule) sets.String) { + allViolations := sets.NewString() + + for group, alerts := range rules { + for _, alert := range alerts { + // The Watchdog alert is special because it is only there to + // test the end-to-end alert flow and it isn't meant to be + // routed to cluster admins. + if alert.Name == "Watchdog" { + continue + } + + if violations := f(alert); violations != nil { + for _, v := range violations.List() { + allViolations.Insert( + fmt.Sprintf("Alerting rule %q (group: %s) %s", alert.Name, group, v), + ) + } + } + } + } + + if len(allViolations) > 0 { + framework.Failf("Incompliant rules detected:\n\n%s", strings.Join(allViolations.List(), "\n")) + } +}