Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 166 additions & 0 deletions test/extended/prometheus/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (

g "github.com/onsi/ginkgo"
o "github.com/onsi/gomega"
promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
dto "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"
"github.com/prometheus/common/model"
Expand Down Expand Up @@ -42,6 +43,171 @@ import (
helper "github.com/openshift/origin/test/extended/util/prometheus"
)

var _ = g.Describe("[sig-instrumentation][Late] OpenShift alerting rules", func() {
defer g.GinkgoRecover()

// These alerts are known to be missing the summary and/or description
// labels. Bugzillas are being filed, and will be linked here. These
// should be fixed one-by-one and removed from this list.
descriptionExceptions := sets.NewString(
"APIRemovedInNextEUSReleaseInUse",
"APIRemovedInNextReleaseInUse",
"CertifiedOperatorsCatalogError",
"CloudCredentialOperatorDeprovisioningFailed",
"CloudCredentialOperatorInsufficientCloudCreds",
"CloudCredentialOperatorProvisioningFailed",
"CloudCredentialOperatorTargetNamespaceMissing",
"ClusterProxyApplySlow",
"CommunityOperatorsCatalogError",
"CsvAbnormalFailedOver2Min",
"CsvAbnormalOver30Min",
"ExtremelyHighIndividualControlPlaneCPU",
"HAProxyDown",
"HAProxyReloadFail",
"HighOverallControlPlaneCPU",
"ImageRegistryStorageReconfigured",
"IngressControllerDegraded",
"IngressControllerUnavailable",
"InstallPlanStepAppliedWithWarnings",
"KubeControllerManagerDown",
"KubeSchedulerDown",
"KubeletHealthState",
"MCDDrainError",
"MCDPivotError",
"MCDRebootError",
"MachineAPIOperatorMetricsCollectionFailing",
"MachineApproverMaxPendingCSRsReached",
"MachineHealthCheckUnterminatedShortCircuit",
"MachineNotYetDeleted",
"MachineWithNoRunningPhase",
"MachineWithoutValidNode",
"MasterNodesHighMemoryUsage",
"NodeProxyApplySlow",
"NodeProxyApplyStale",
"NodeWithoutSDNPod",
"PodDisruptionBudgetAtLimit",
"PodDisruptionBudgetLimit",
"RedhatMarketplaceCatalogError",
"RedhatOperatorsCatalogError",
"SDNPodNotReady",
"SamplesDegraded",
"SamplesImagestreamImportFailing",
"SamplesInvalidConfig",
"SamplesMissingSecret",
"SamplesMissingTBRCredential",
"SamplesRetriesMissingOnImagestreamImportFailing",
"SamplesTBRInaccessibleOnBoot",
"SchedulerLegacyPolicySet",
"SystemMemoryExceedsReservation",
"TechPreviewNoUpgrade",
"etcdBackendQuotaLowSpace",
"etcdExcessiveDatabaseGrowth",
"etcdHighFsyncDurations",
)

var alertingRules map[string][]promv1.AlertingRule
oc := exutil.NewCLIWithoutNamespace("prometheus")

g.BeforeEach(func() {
url, _, bearerToken, ok := helper.LocatePrometheus(oc)
if !ok {
e2e.Failf("Prometheus could not be located on this cluster, failing prometheus test")
}

if alertingRules == nil {
var err error
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: we could have fetchAlertingRules initializing the error variable since it isn't used anywhere else.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was trying to avoid fetching these each time, while still being able to separate each check into its own g.It("should..") block. So alertingRules is actually defined in the outer scope, and it only does the fetch if it hasn't been initialized. Having the call to fetchAlertingRules() initialize err would shadow alertingRules. Let me know if I'm misunderstanding though.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah indeed, I missed that alertingRules was defined in the outer scope, makes sense now 👍


alertingRules, err = helper.FetchAlertingRules(oc, url, bearerToken)
if err != nil {
e2e.Failf("Failed to fetch alerting rules: %v", err)
}
}
})

g.It("should have a valid severity label", func() {
helper.ForEachAlertingRule(alertingRules, func(alert promv1.AlertingRule) sets.String {
severityRe := regexp.MustCompile("^critical|warning|info$")

severity, found := alert.Labels["severity"]
if !found {
return sets.NewString("has no 'severity' label")
}

if !severityRe.MatchString(string(severity)) {
return sets.NewString(
fmt.Sprintf("has a 'severity' label value of %q which doesn't match %q",
severity, severityRe.String(),
),
)
}

return nil
})
})

g.It("should have description and summary annotations", func() {
helper.ForEachAlertingRule(alertingRules, func(alert promv1.AlertingRule) sets.String {
if descriptionExceptions.Has(alert.Name) {
framework.Logf("Alerting rule %q is known to have missing annotations.")
return nil
}

violations := sets.NewString()

if _, found := alert.Annotations["description"]; !found {
// If there's no 'description' annotation, but there is a
// 'message' annotation, suggest renaming it.
if _, found := alert.Annotations["message"]; found {
violations.Insert("has no 'description' annotation, but has a 'message' annotation." +
" OpenShift alerts must use 'description' -- consider renaming the annotation")
} else {
violations.Insert("has no 'description' annotation")
}
}

if _, found := alert.Annotations["summary"]; !found {
violations.Insert("has no 'summary' annotation")
}

return violations
})
})

g.It("should have a runbook_url annotation if the alert is critical", func() {
violations := sets.NewString()

helper.ForEachAlertingRule(alertingRules, func(alert promv1.AlertingRule) sets.String {
severity := string(alert.Labels["severity"])
runbook := string(alert.Annotations["runbook_url"])

if severity == "critical" && runbook == "" {
violations.Insert(
fmt.Sprintf("WARNING: Alert %q is critical and has no 'runbook_url' annotation", alert.Name),
)
} else if runbook != "" {
// If there's a 'runbook_url' annotation, make sure it's a
// valid URL and that we can fetch the contents.
if err := helper.ValidateURL(runbook, 10*time.Second); err != nil {
violations.Insert(
fmt.Sprintf("WARNING: Alert %q has an invalid 'runbook_url' annotation: %v",
alert.Name, err),
)
}
}

return nil // Always return nil, because these aren't required yet.
})

// These aren't enforced yet. They are just reporting the errors by
// calling the g.Skip() method.
if len(violations) > 0 {
g.Skip(
fmt.Sprintf("Incompliant rules detected:\n\n%s", strings.Join(violations.List(), "\n")),
)
}
})
})

var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
defer g.GinkgoRecover()
var (
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions test/extended/util/annotate/rules.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,9 @@ var (
`\[sig-instrumentation\] Prometheus when installed on the cluster shouldn't have failing rules evaluation`,
`\[sig-instrumentation\] Prometheus when installed on the cluster shouldn't report any alerts in firing state apart from Watchdog and AlertmanagerReceiversNotConfigured \[Early\]`,
`\[sig-instrumentation\] Prometheus when installed on the cluster when using openshift-sdn should be able to get the sdn ovs flows`,
`\[sig-instrumentation\]\[Late\] OpenShift alerting rules should have a valid severity label`,
`\[sig-instrumentation\]\[Late\] OpenShift alerting rules should have description and summary annotations`,
`\[sig-instrumentation\]\[Late\] OpenShift alerting rules should have a runbook_url annotation if the alert is critical`,
`\[sig-instrumentation\]\[Late\] Alerts should have a Watchdog alert in firing state the entire cluster run`,
`\[sig-instrumentation\]\[Late\] Alerts shouldn't exceed the 500 series limit of total series sent via telemetry from each cluster`,
`\[sig-instrumentation\]\[Late\] Alerts shouldn't report any alerts in firing or pending state apart from Watchdog and AlertmanagerReceiversNotConfigured and have no gaps in Watchdog firing`,
Expand Down
105 changes: 105 additions & 0 deletions test/extended/util/prometheus/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"encoding/json"
"fmt"
"net/http"
"net/url"
"strconv"
"strings"
Expand All @@ -12,8 +13,11 @@ import (
g "github.com/onsi/ginkgo"
o "github.com/onsi/gomega"
"k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"

exutil "github.com/openshift/origin/test/extended/util"
promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
"github.com/prometheus/common/model"

v1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -257,3 +261,104 @@ func ExpectPrometheusEndpoint(namespace, podName, url string) {
}
o.Expect(err).NotTo(o.HaveOccurred())
}

// FetchAlertingRules fetches all alerting rules from the Prometheus at
// the given URL using the given bearer token. The results are returned
// as a map of group names to lists of alerting rules.
//
// The query runs from a short-lived exec pod in a test namespace rather
// than directly from the test process, via GetBearerTokenURLViaPod.
func FetchAlertingRules(oc *exutil.CLI, promURL, bearerToken string) (map[string][]promv1.AlertingRule, error) {
	ns := oc.SetupNamespace()
	execPod := exutil.CreateExecPodOrFail(oc.AdminKubeClient(), ns, "execpod")
	defer func() {
		// Best-effort cleanup of the exec pod (grace period of 1); the
		// delete error is intentionally ignored.
		oc.AdminKubeClient().CoreV1().Pods(ns).Delete(context.Background(), execPod.Name, *metav1.NewDeleteOptions(1))
	}()

	url := fmt.Sprintf("%s/api/v1/rules", promURL)
	contents, err := GetBearerTokenURLViaPod(ns, execPod.Name, url, bearerToken)
	if err != nil {
		return nil, fmt.Errorf("unable to query %s: %v", url, err)
	}

	// The response follows the Prometheus HTTP API envelope:
	// {"status": ..., "data": {"groups": [...]}}.
	var result struct {
		Status string             `json:"status"`
		Data   promv1.RulesResult `json:"data"`
	}
	if err := json.Unmarshal([]byte(contents), &result); err != nil {
		return nil, fmt.Errorf("unable to parse response %q from %s: %v", contents, url, err)
	}

	alertingRules := make(map[string][]promv1.AlertingRule)

	// Keep only alerting rules, keyed by their group name. Recording
	// rules are skipped; any other rule type is unexpected and an error.
	for _, rg := range result.Data.Groups {
		for _, r := range rg.Rules {
			switch v := r.(type) {
			case promv1.RecordingRule:
				continue
			case promv1.AlertingRule:
				alertingRules[rg.Name] = append(alertingRules[rg.Name], v)
			default:
				return nil, fmt.Errorf("unexpected rule of type %T", r)
			}
		}
	}

	return alertingRules, nil
}

// ValidateURL takes a URL as a string and a timeout. The URL is
// parsed, then fetched once per second until a 200 response is received
// or the timeout is reached. A non-nil error is returned if the URL is
// unparseable or never answered with a 200 within the timeout.
func ValidateURL(rawURL string, timeout time.Duration) error {
	if _, err := url.Parse(rawURL); err != nil {
		return err
	}

	return wait.PollImmediate(1*time.Second, timeout, func() (done bool, err error) {
		resp, err := http.Get(rawURL)
		if err != nil {
			// Pass rawURL: the original call supplied only err for the
			// two verbs %q and %v, producing a garbled log line.
			framework.Logf("validateURL(%q) error: %v", rawURL, err)
			return false, nil // transient; keep polling until timeout
		}

		defer resp.Body.Close()

		if resp.StatusCode == 200 {
			return true, nil
		}

		framework.Logf("validateURL(%q) got non-200 response: %d", rawURL, resp.StatusCode)
		return false, nil
	})
}

// ForEachAlertingRule takes a map of rule group names to a list of
// alerting rules, and for each rule in each group runs the given
// function. The function takes the alerting rule, and returns a set of
// violations, which may be empty or nil. If after all rules are
// checked, there are any violations, the calling test is failed, and
// all violations are reported.
func ForEachAlertingRule(rules map[string][]promv1.AlertingRule, f func(a promv1.AlertingRule) sets.String) {
	allViolations := sets.NewString()

	for group, alerts := range rules {
		for i := range alerts {
			alert := alerts[i]

			// The Watchdog alert is special because it is only there to
			// test the end-to-end alert flow and it isn't meant to be
			// routed to cluster admins.
			if alert.Name == "Watchdog" {
				continue
			}

			violations := f(alert)
			if violations == nil {
				continue
			}

			for _, violation := range violations.List() {
				allViolations.Insert(
					fmt.Sprintf("Alerting rule %q (group: %s) %s", alert.Name, group, violation),
				)
			}
		}
	}

	if allViolations.Len() > 0 {
		framework.Failf("Incompliant rules detected:\n\n%s", strings.Join(allViolations.List(), "\n"))
	}
}