diff --git a/test/extended/prometheus/prometheus.go b/test/extended/prometheus/prometheus.go
index 170d7bb81007..7c9456fbedb4 100644
--- a/test/extended/prometheus/prometheus.go
+++ b/test/extended/prometheus/prometheus.go
@@ -13,7 +13,6 @@ import (
 
 	g "github.com/onsi/ginkgo"
 	o "github.com/onsi/gomega"
-	promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
 	dto "github.com/prometheus/client_model/go"
 	"github.com/prometheus/common/expfmt"
 	"github.com/prometheus/common/model"
@@ -43,171 +42,6 @@ import (
 	helper "github.com/openshift/origin/test/extended/util/prometheus"
 )
 
-var _ = g.Describe("[sig-instrumentation][Late] OpenShift alerting rules", func() {
-	defer g.GinkgoRecover()
-
-	// These alerts are known to be missing the summary and/or description
-	// labels. Bugzillas are being filed, and will be linked here. These
-	// should be fixed one-by-one and removed from this list.
-	descriptionExceptions := sets.NewString(
-		"APIRemovedInNextEUSReleaseInUse",
-		"APIRemovedInNextReleaseInUse",
-		"CertifiedOperatorsCatalogError",
-		"CloudCredentialOperatorDeprovisioningFailed",
-		"CloudCredentialOperatorInsufficientCloudCreds",
-		"CloudCredentialOperatorProvisioningFailed",
-		"CloudCredentialOperatorTargetNamespaceMissing",
-		"ClusterProxyApplySlow",
-		"CommunityOperatorsCatalogError",
-		"CsvAbnormalFailedOver2Min",
-		"CsvAbnormalOver30Min",
-		"ExtremelyHighIndividualControlPlaneCPU",
-		"HAProxyDown",
-		"HAProxyReloadFail",
-		"HighOverallControlPlaneCPU",
-		"ImageRegistryStorageReconfigured",
-		"IngressControllerDegraded",
-		"IngressControllerUnavailable",
-		"InstallPlanStepAppliedWithWarnings",
-		"KubeControllerManagerDown",
-		"KubeSchedulerDown",
-		"KubeletHealthState",
-		"MCDDrainError",
-		"MCDPivotError",
-		"MCDRebootError",
-		"MachineAPIOperatorMetricsCollectionFailing",
-		"MachineApproverMaxPendingCSRsReached",
-		"MachineHealthCheckUnterminatedShortCircuit",
-		"MachineNotYetDeleted",
-		"MachineWithNoRunningPhase",
-		"MachineWithoutValidNode",
-		"MasterNodesHighMemoryUsage",
-		"NodeProxyApplySlow",
-		"NodeProxyApplyStale",
-		"NodeWithoutSDNPod",
-		"PodDisruptionBudgetAtLimit",
-		"PodDisruptionBudgetLimit",
-		"RedhatMarketplaceCatalogError",
-		"RedhatOperatorsCatalogError",
-		"SDNPodNotReady",
-		"SamplesDegraded",
-		"SamplesImagestreamImportFailing",
-		"SamplesInvalidConfig",
-		"SamplesMissingSecret",
-		"SamplesMissingTBRCredential",
-		"SamplesRetriesMissingOnImagestreamImportFailing",
-		"SamplesTBRInaccessibleOnBoot",
-		"SchedulerLegacyPolicySet",
-		"SystemMemoryExceedsReservation",
-		"TechPreviewNoUpgrade",
-		"etcdBackendQuotaLowSpace",
-		"etcdExcessiveDatabaseGrowth",
-		"etcdHighFsyncDurations",
-	)
-
-	var alertingRules map[string][]promv1.AlertingRule
-	oc := exutil.NewCLIWithoutNamespace("prometheus")
-
-	g.BeforeEach(func() {
-		url, _, bearerToken, ok := helper.LocatePrometheus(oc)
-		if !ok {
-			e2e.Failf("Prometheus could not be located on this cluster, failing prometheus test")
-		}
-
-		if alertingRules == nil {
-			var err error
-
-			alertingRules, err = helper.FetchAlertingRules(oc, url, bearerToken)
-			if err != nil {
-				e2e.Failf("Failed to fetch alerting rules: %v", err)
-			}
-		}
-	})
-
-	g.It("should have a valid severity label", func() {
-		helper.ForEachAlertingRule(alertingRules, func(alert promv1.AlertingRule) sets.String {
-			severityRe := regexp.MustCompile("^critical|warning|info$")
-
-			severity, found := alert.Labels["severity"]
-			if !found {
-				return sets.NewString("has no 'severity' label")
-			}
-
-			if !severityRe.MatchString(string(severity)) {
-				return sets.NewString(
-					fmt.Sprintf("has a 'severity' label value of %q which doesn't match %q",
-						severity, severityRe.String(),
-					),
-				)
-			}
-
-			return nil
-		})
-	})
-
-	g.It("should have description and summary annotations", func() {
-		helper.ForEachAlertingRule(alertingRules, func(alert promv1.AlertingRule) sets.String {
-			if descriptionExceptions.Has(alert.Name) {
-				framework.Logf("Alerting rule %q is known to have missing annotations.")
-				return nil
-			}
-
-			violations := sets.NewString()
-
-			if _, found := alert.Annotations["description"]; !found {
-				// If there's no 'description' annotation, but there is a
-				// 'message' annotation, suggest renaming it.
-				if _, found := alert.Annotations["message"]; found {
-					violations.Insert("has no 'description' annotation, but has a 'message' annotation." +
-						" OpenShift alerts must use 'description' -- consider renaming the annotation")
-				} else {
-					violations.Insert("has no 'description' annotation")
-				}
-			}
-
-			if _, found := alert.Annotations["summary"]; !found {
-				violations.Insert("has no 'summary' annotation")
-			}
-
-			return violations
-		})
-	})
-
-	g.It("should have a runbook_url annotation if the alert is critical", func() {
-		violations := sets.NewString()
-
-		helper.ForEachAlertingRule(alertingRules, func(alert promv1.AlertingRule) sets.String {
-			severity := string(alert.Labels["severity"])
-			runbook := string(alert.Annotations["runbook_url"])
-
-			if severity == "critical" && runbook == "" {
-				violations.Insert(
-					fmt.Sprintf("WARNING: Alert %q is critical and has no 'runbook_url' annotation", alert.Name),
-				)
-			} else if runbook != "" {
-				// If there's a 'runbook_url' annotation, make sure it's a
-				// valid URL and that we can fetch the contents.
-				if err := helper.ValidateURL(runbook, 10*time.Second); err != nil {
-					violations.Insert(
-						fmt.Sprintf("WARNING: Alert %q has an invalid 'runbook_url' annotation: %v",
-							alert.Name, err),
-					)
-				}
-			}
-
-			return nil // Always return nil, because these aren't required yet.
-		})
-
-		// These aren't enforced yet. They are just reporting the errors by
-		// calling the g.Skip() method.
-		if len(violations) > 0 {
-			g.Skip(
-				fmt.Sprintf("Incompliant rules detected:\n\n%s", strings.Join(violations.List(), "\n")),
-			)
-		}
-	})
-})
-
 var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
 	defer g.GinkgoRecover()
 	var (
diff --git a/test/extended/util/annotate/generated/zz_generated.annotations.go b/test/extended/util/annotate/generated/zz_generated.annotations.go
index 84ee6f52ec22..99cef9a6951c 100644
--- a/test/extended/util/annotate/generated/zz_generated.annotations.go
+++ b/test/extended/util/annotate/generated/zz_generated.annotations.go
@@ -1717,12 +1717,6 @@ var annotations = map[string]string{
 
 	"[Top Level] [sig-instrumentation][Late] Alerts shouldn't report any alerts in firing or pending state apart from Watchdog and AlertmanagerReceiversNotConfigured and have no gaps in Watchdog firing": "shouldn't report any alerts in firing or pending state apart from Watchdog and AlertmanagerReceiversNotConfigured and have no gaps in Watchdog firing [Skipped:Disconnected] [Suite:openshift/conformance/parallel]",
 
-	"[Top Level] [sig-instrumentation][Late] OpenShift alerting rules should have a runbook_url annotation if the alert is critical": "should have a runbook_url annotation if the alert is critical [Skipped:Disconnected] [Suite:openshift/conformance/parallel]",
-
-	"[Top Level] [sig-instrumentation][Late] OpenShift alerting rules should have a valid severity label": "should have a valid severity label [Skipped:Disconnected] [Suite:openshift/conformance/parallel]",
-
-	"[Top Level] [sig-instrumentation][Late] OpenShift alerting rules should have description and summary annotations": "should have description and summary annotations [Skipped:Disconnected] [Suite:openshift/conformance/parallel]",
-
 	"[Top Level] [sig-instrumentation][sig-builds][Feature:Builds] Prometheus when installed on the cluster should start and expose a secured proxy and verify build metrics": "should start and expose a secured proxy and verify build metrics [Skipped:Disconnected] [Suite:openshift/conformance/parallel]",
 
 	"[Top Level] [sig-network-edge] DNS should answer A and AAAA queries for a dual-stack service": "should answer A and AAAA queries for a dual-stack service [Suite:openshift/conformance/parallel]",
diff --git a/test/extended/util/annotate/rules.go b/test/extended/util/annotate/rules.go
index f4cac6b447f2..5f489412758f 100644
--- a/test/extended/util/annotate/rules.go
+++ b/test/extended/util/annotate/rules.go
@@ -212,9 +212,6 @@ var (
 		`\[sig-instrumentation\] Prometheus when installed on the cluster shouldn't have failing rules evaluation`,
 		`\[sig-instrumentation\] Prometheus when installed on the cluster shouldn't report any alerts in firing state apart from Watchdog and AlertmanagerReceiversNotConfigured \[Early\]`,
 		`\[sig-instrumentation\] Prometheus when installed on the cluster when using openshift-sdn should be able to get the sdn ovs flows`,
-		`\[sig-instrumentation\]\[Late\] OpenShift alerting rules should have a valid severity label`,
-		`\[sig-instrumentation\]\[Late\] OpenShift alerting rules should have description and summary annotations`,
-		`\[sig-instrumentation\]\[Late\] OpenShift alerting rules should have a runbook_url annotation if the alert is critical`,
 		`\[sig-instrumentation\]\[Late\] Alerts should have a Watchdog alert in firing state the entire cluster run`,
 		`\[sig-instrumentation\]\[Late\] Alerts shouldn't exceed the 500 series limit of total series sent via telemetry from each cluster`,
 		`\[sig-instrumentation\]\[Late\] Alerts shouldn't report any alerts in firing or pending state apart from Watchdog and AlertmanagerReceiversNotConfigured and have no gaps in Watchdog firing`,
diff --git a/test/extended/util/prometheus/helpers.go b/test/extended/util/prometheus/helpers.go
index 3086018f7034..2ae1fd65a0a2 100644
--- a/test/extended/util/prometheus/helpers.go
+++ b/test/extended/util/prometheus/helpers.go
@@ -4,7 +4,6 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
-	"net/http"
 	"net/url"
 	"strconv"
 	"strings"
@@ -13,11 +12,8 @@ import (
 	g "github.com/onsi/ginkgo"
 	o "github.com/onsi/gomega"
 	"k8s.io/apimachinery/pkg/util/errors"
-	"k8s.io/apimachinery/pkg/util/sets"
-	"k8s.io/apimachinery/pkg/util/wait"
 
 	exutil "github.com/openshift/origin/test/extended/util"
-	promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
 	"github.com/prometheus/common/model"
 
 	v1 "k8s.io/api/core/v1"
@@ -261,104 +257,3 @@ func ExpectPrometheusEndpoint(namespace, podName, url string) {
 	}
 	o.Expect(err).NotTo(o.HaveOccurred())
 }
-
-// FetchAlertingRules fetchs all alerting rules from the Prometheus at
-// the given URL using the given bearer token. The results are returned
-// as a map of group names to lists of alerting rules.
-func FetchAlertingRules(oc *exutil.CLI, promURL, bearerToken string) (map[string][]promv1.AlertingRule, error) {
-	ns := oc.SetupNamespace()
-	execPod := exutil.CreateExecPodOrFail(oc.AdminKubeClient(), ns, "execpod")
-	defer func() {
-		oc.AdminKubeClient().CoreV1().Pods(ns).Delete(context.Background(), execPod.Name, *metav1.NewDeleteOptions(1))
-	}()
-
-	url := fmt.Sprintf("%s/api/v1/rules", promURL)
-	contents, err := GetBearerTokenURLViaPod(ns, execPod.Name, url, bearerToken)
-	if err != nil {
-		return nil, fmt.Errorf("unable to query %s: %v", url, err)
-	}
-
-	var result struct {
-		Status string             `json:"status"`
-		Data   promv1.RulesResult `json:"data"`
-	}
-	if err := json.Unmarshal([]byte(contents), &result); err != nil {
-		return nil, fmt.Errorf("unable to parse response %q from %s: %v", contents, url, err)
-	}
-
-	alertingRules := make(map[string][]promv1.AlertingRule)
-
-	for _, rg := range result.Data.Groups {
-		for _, r := range rg.Rules {
-			switch v := r.(type) {
-			case promv1.RecordingRule:
-				continue
-			case promv1.AlertingRule:
-				alertingRules[rg.Name] = append(alertingRules[rg.Name], v)
-			default:
-				return nil, fmt.Errorf("unexpected rule of type %T", r)
-			}
-		}
-	}
-
-	return alertingRules, nil
-}
-
-// ValidateURL takes a URL as a string and a timeout. The URL is
-// parsed, then fetched until a 200 response is received or the timeout
-// is reached.
-func ValidateURL(rawURL string, timeout time.Duration) error {
-	if _, err := url.Parse(rawURL); err != nil {
-		return err
-	}
-
-	return wait.PollImmediate(1*time.Second, timeout, func() (done bool, err error) {
-		resp, err := http.Get(rawURL)
-		if err != nil {
-			framework.Logf("validateURL(%q) error: %v", err)
-			return false, nil
-		}
-
-		defer resp.Body.Close()
-
-		if resp.StatusCode == 200 {
-			return true, nil
-		}
-
-		framework.Logf("validateURL(%q) got non-200 response: %d", rawURL, resp.StatusCode)
-		return false, nil
-	})
-}
-
-// ForEachAlertingRule takes a map of rule group names to a list of
-// alerting rules, and for each rule in each group runs the given
-// function. The function takes the alerting rule, and returns a set of
-// violations, which maye be empty or nil. If after all rules are
-// checked, there are any violations, the calling test is failed, and
-// all violations are reported.
-func ForEachAlertingRule(rules map[string][]promv1.AlertingRule, f func(a promv1.AlertingRule) sets.String) {
-	allViolations := sets.NewString()
-
-	for group, alerts := range rules {
-		for _, alert := range alerts {
-			// The Watchdog alert is special because it is only there to
-			// test the end-to-end alert flow and it isn't meant to be
-			// routed to cluster admins.
-			if alert.Name == "Watchdog" {
-				continue
-			}
-
-			if violations := f(alert); violations != nil {
-				for _, v := range violations.List() {
-					allViolations.Insert(
-						fmt.Sprintf("Alerting rule %q (group: %s) %s", alert.Name, group, v),
-					)
-				}
-			}
-		}
-	}
-
-	if len(allViolations) > 0 {
-		framework.Failf("Incompliant rules detected:\n\n%s", strings.Join(allViolations.List(), "\n"))
-	}
-}