diff --git a/install/0000_90_cluster-version-operator_02_servicemonitor.yaml b/install/0000_90_cluster-version-operator_02_servicemonitor.yaml index c2f609257..3808331a8 100644 --- a/install/0000_90_cluster-version-operator_02_servicemonitor.yaml +++ b/install/0000_90_cluster-version-operator_02_servicemonitor.yaml @@ -87,9 +87,9 @@ spec: - alert: ClusterOperatorDown annotations: summary: Cluster operator has not been available for 10 minutes. - description: The {{ "{{ $labels.name }}" }} operator may be down or disabled, and the components it manages may be unavailable or degraded. Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator {{ "{{ $labels.name }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}. + description: The {{ "{{ $labels.name }}" }} operator may be down or disabled because {{ "{{ $labels.reason }}" }}, and the components it manages may be unavailable or degraded. Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator {{ "{{ $labels.name }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}. expr: | - max by (namespace, name) (cluster_operator_up{job="cluster-version-operator"} == 0) + max by (namespace, name, reason) (cluster_operator_up{job="cluster-version-operator"} == 0) for: 10m labels: severity: critical diff --git a/pkg/cvo/metrics.go b/pkg/cvo/metrics.go index 8b251d2cb..78a96642d 100644 --- a/pkg/cvo/metrics.go +++ b/pkg/cvo/metrics.go @@ -94,8 +94,8 @@ version for 'cluster', or empty for 'initial'. 
}, []string{"name"}), clusterOperatorUp: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "cluster_operator_up", - Help: "Reports key highlights of the active cluster operators.", - }, []string{"name", "version"}), + Help: "1 if a cluster operator is Available=True. 0 otherwise, including if a cluster operator sets no Available condition. The 'version' label tracks the 'operator' version. The 'reason' label is passed through from the Available condition, unless the cluster operator sets no Available condition, in which case NoAvailableCondition is used.", + }, []string{"name", "version", "reason"}), clusterOperatorConditions: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "cluster_operator_conditions", Help: "Report the conditions for active cluster operators. 0 is False and 1 is True.", @@ -339,7 +339,7 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) { ch <- m.version.WithLabelValues("", "", "", "").Desc() ch <- m.availableUpdates.WithLabelValues("", "").Desc() ch <- m.capability.WithLabelValues("").Desc() - ch <- m.clusterOperatorUp.WithLabelValues("", "").Desc() + ch <- m.clusterOperatorUp.WithLabelValues("", "", "").Desc() ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc() ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc() ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc() @@ -489,12 +489,16 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) { if version == "" { klog.V(2).Infof("ClusterOperator %s is not setting the 'operator' version", op.Name) } - g := m.clusterOperatorUp.WithLabelValues(op.Name, version) - if resourcemerge.IsOperatorStatusConditionTrue(op.Status.Conditions, configv1.OperatorAvailable) { - g.Set(1) - } else { - g.Set(0) + var isUp float64 + reason := "NoAvailableCondition" + if condition := resourcemerge.FindOperatorStatusCondition(op.Status.Conditions, configv1.OperatorAvailable); condition != nil { + reason = condition.Reason + if condition.Status == 
configv1.ConditionTrue { + isUp = 1 + } } + g := m.clusterOperatorUp.WithLabelValues(op.Name, version, reason) + g.Set(isUp) ch <- g for _, condition := range op.Status.Conditions { if condition.Status != configv1.ConditionFalse && condition.Status != configv1.ConditionTrue { diff --git a/pkg/cvo/metrics_test.go b/pkg/cvo/metrics_test.go index 0891b50df..2bab3c591 100644 --- a/pkg/cvo/metrics_test.go +++ b/pkg/cvo/metrics_test.go @@ -193,7 +193,7 @@ func Test_operatorMetrics_Collect(t *testing.T) { t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics)) } expectMetric(t, metrics[0], 0, map[string]string{"type": "current", "version": "", "image": "", "from_version": ""}) - expectMetric(t, metrics[1], 0, map[string]string{"name": "test", "version": "10.1.5-1"}) + expectMetric(t, metrics[1], 0, map[string]string{"name": "test", "version": "10.1.5-1", "reason": "NoAvailableCondition"}) expectMetric(t, metrics[2], 1, map[string]string{"type": ""}) }, },