From 90539f9a17f49669c1b16628af63da858bb5f622 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Mon, 26 Apr 2021 12:49:35 -0700 Subject: [PATCH] pkg/cvo/metrics: Ignore Degraded for cluster_operator_up ClusterOperatorDown is based on cluster_operator_up, but we also have ClusterOperatorDegraded based on cluster_operator_conditions{condition="Degraded"}. Firing ClusterOperatorDown for operators which are Available=True and Degraded=True confuses users [1]. ClusterOperatorDegraded will also be firing. With this commit, I'm adjusting cluster_operator_up to only care about Available, to decouple the two alerts and bring them in line with their existing "has not been available" and "has been degraded" descriptions. However, IsOperatorStatusConditionTrue requires the condition to be present, and cluster_operator_conditions only creates entries when the conditions are present. To guard against the Degraded-unset condition in ClusterOperatorDegraded, I'm covering with an 'or' [2] and 'group by' [3] guard. So we should have the following cases: * Available unset or != True: ClusterOperatorDown will fire. * Available=True, Degraded unset or != False: ClusterOperatorDegraded will fire. Firing on unset is new in this commit. Not firing ClusterOperatorDown here is new in this commit. * Available=True, Degraded=False: Neither alert fires. [1]: https://bugzilla.redhat.com/show_bug.cgi?id=1834551#c0 [2]: https://prometheus.io/docs/prometheus/latest/querying/operators/#logical-set-binary-operators [3]: https://prometheus.io/docs/prometheus/latest/querying/operators/#aggregation-operators --- ...er-version-operator_02_servicemonitor.yaml | 6 +- pkg/cvo/metrics.go | 4 +- pkg/cvo/metrics_test.go | 64 ++++++++++++++++++- 3 files changed, 68 insertions(+), 6 deletions(-) diff --git a/install/0000_90_cluster-version-operator_02_servicemonitor.yaml b/install/0000_90_cluster-version-operator_02_servicemonitor.yaml index 6ad9e2624..edf1b0a1d 100644 --- a/install/0000_90_cluster-version-operator_02_servicemonitor.yaml +++ b/install/0000_90_cluster-version-operator_02_servicemonitor.yaml @@ -82,7 +82,11 @@ spec: annotations: message: Cluster operator {{ "{{ $labels.name }}" }} has been degraded for 10 minutes. Operator is degraded because {{ "{{ $labels.reason }}" }} and cluster upgrades will be unstable. expr: | - cluster_operator_conditions{job="cluster-version-operator", condition="Degraded"} == 1 + ( + cluster_operator_conditions{job="cluster-version-operator", condition="Degraded"} + or on (name) + group by (name) (cluster_operator_up{job="cluster-version-operator"}) + ) == 1 for: 10m labels: severity: critical diff --git a/pkg/cvo/metrics.go b/pkg/cvo/metrics.go index fdeec7acb..b63bc5dd6 100644 --- a/pkg/cvo/metrics.go +++ b/pkg/cvo/metrics.go @@ -358,9 +358,7 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) { klog.V(4).Infof("ClusterOperator %s is not setting the 'operator' version", op.Name) } g := m.clusterOperatorUp.WithLabelValues(op.Name, version) - failing := resourcemerge.IsOperatorStatusConditionTrue(op.Status.Conditions, configv1.OperatorDegraded) - available := resourcemerge.IsOperatorStatusConditionTrue(op.Status.Conditions, configv1.OperatorAvailable) - if available && !failing { + if resourcemerge.IsOperatorStatusConditionTrue(op.Status.Conditions, configv1.OperatorAvailable) { g.Set(1) } else { g.Set(0) diff --git a/pkg/cvo/metrics_test.go b/pkg/cvo/metrics_test.go index d74f07fe6..0891b50df 100644 --- a/pkg/cvo/metrics_test.go +++ b/pkg/cvo/metrics_test.go @@ -170,7 +170,67 @@ func Test_operatorMetrics_Collect(t *testing.T) { }, }, { - name: "collects cluster operator status failure", + name: "collects cluster operator without conditions", + optr: &Operator{ + coLister: &coLister{ + Items: []*configv1.ClusterOperator{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Status: configv1.ClusterOperatorStatus{ + Versions: []configv1.OperandVersion{ + {Name: "operator", Version: "10.1.5-1"}, + {Name: "operand", Version: "10.1.5-2"}, + }, + }, + }, + }, + }, + }, + wants: func(t *testing.T, metrics []prometheus.Metric) { + if len(metrics) != 3 { + t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics)) + } + expectMetric(t, metrics[0], 0, map[string]string{"type": "current", "version": "", "image": "", "from_version": ""}) + expectMetric(t, metrics[1], 0, map[string]string{"name": "test", "version": "10.1.5-1"}) + expectMetric(t, metrics[2], 1, map[string]string{"type": ""}) + }, + }, + { + name: "collects cluster operator unavailable", + optr: &Operator{ + coLister: &coLister{ + Items: []*configv1.ClusterOperator{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Status: configv1.ClusterOperatorStatus{ + Versions: []configv1.OperandVersion{ + {Name: "operator", Version: "10.1.5-1"}, + {Name: "operand", Version: "10.1.5-2"}, + }, + Conditions: []configv1.ClusterOperatorStatusCondition{ + {Type: configv1.OperatorAvailable, Status: configv1.ConditionFalse}, + }, + }, + }, + }, + }, + }, + wants: func(t *testing.T, metrics []prometheus.Metric) { + if len(metrics) != 4 { + t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics)) + } + expectMetric(t, metrics[0], 0, map[string]string{"type": "current", "version": "", "image": "", "from_version": ""}) + expectMetric(t, metrics[1], 0, map[string]string{"name": "test", "version": "10.1.5-1"}) + expectMetric(t, metrics[2], 0, map[string]string{"name": "test", "condition": "Available"}) + expectMetric(t, metrics[3], 1, map[string]string{"type": ""}) + }, + }, + { + name: "collects cluster operator degraded", optr: &Operator{ coLister: &coLister{ Items: []*configv1.ClusterOperator{ @@ -197,7 +257,7 @@ func Test_operatorMetrics_Collect(t *testing.T) { t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics)) } expectMetric(t, metrics[0], 0, map[string]string{"type": "current", "version": "", "image": "", "from_version": ""}) - expectMetric(t, metrics[1], 0, map[string]string{"name": "test", "version": "10.1.5-1"}) + expectMetric(t, metrics[1], 1, map[string]string{"name": "test", "version": "10.1.5-1"}) expectMetric(t, metrics[2], 1, map[string]string{"name": "test", "condition": "Available"}) expectMetric(t, metrics[3], 1, map[string]string{"name": "test", "condition": "Degraded"}) expectMetric(t, metrics[4], 1, map[string]string{"type": ""})