Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/user/status.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ This document describes those conditions and, where appropriate, suggests possib
When `RetrievedUpdates` is `True`, the CVO is successfully retrieving updates, which is good.
When `RetrievedUpdates` is `False`, `reason` will be set to explain why, as discussed in the following subsections.
In all cases, the impact is that the cluster will not be able to retrieve recommended updates, so cluster admins will need to monitor for available updates on their own or risk falling behind on security or other bugfixes.
When the CVO is unable to retrieve recommended updates, the `CannotRetrieveUpdates` alert will fire, and its message will contain the reason. This alert will not fire when the reason that updates cannot be retrieved is `NoChannel`.

### NoUpstream

Expand Down
11 changes: 9 additions & 2 deletions install/0000_90_cluster-version-operator_02_servicemonitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,19 +44,26 @@ spec:
for: 10m
labels:
severity: critical
- alert: CannotRetrieveUpdates
annotations:
message: Cluster version operator has not retrieved updates in {{ "{{ $value | humanizeDuration }}" }}. Failure reason {{ "{{ with $cluster_operator_conditions := \"cluster_operator_conditions\" | query}}{{range $value := .}}{{if and (eq (label \"name\" $value) \"version\") (eq (label \"condition\" $value) \"RetrievedUpdates\") (eq (label \"endpoint\" $value) \"metrics\") (eq (value $value) 0.0)}}{{label \"reason\" $value}} {{end}}{{end}}{{end}}" }}. {{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} For more information refer to {{ label \"url\" (first $console_url ) }}/settings/cluster/.{{ end }}{{ end }}" }}
expr: |
(time()-cluster_version_operator_update_retrieval_timestamp_seconds) >= 3600 and ignoring(condition, name, reason) cluster_operator_conditions{name="version", condition="RetrievedUpdates", endpoint="metrics", reason!="NoChannel"}
labels:
severity: critical
- name: cluster-operators
rules:
- alert: ClusterOperatorDown
annotations:
message: Cluster operator {{ "{{ $labels.name }}" }} has not been available for 10 mins. Operator may be down or disabled, cluster will not be kept up to date and upgrades will not be possible.
message: Cluster operator {{ "{{ $labels.name }}" }} has not been available for 10 minutes. Operator may be down or disabled, cluster will not be kept up to date and upgrades will not be possible.
expr: |
cluster_operator_up{job="cluster-version-operator"} == 0
for: 10m
labels:
severity: critical
- alert: ClusterOperatorDegraded
annotations:
message: Cluster operator {{ "{{ $labels.name }}" }} has been degraded for 10 mins. Operator is degraded because {{ "{{ $labels.reason }}" }} and cluster upgrades will be unstable.
message: Cluster operator {{ "{{ $labels.name }}" }} has been degraded for 10 minutes. Operator is degraded because {{ "{{ $labels.reason }}" }} and cluster upgrades will be unstable.
expr: |
cluster_operator_conditions{job="cluster-version-operator", condition="Degraded"} == 1
for: 10m
Expand Down
4 changes: 3 additions & 1 deletion pkg/cvo/availableupdates.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ import (
"github.com/openshift/cluster-version-operator/pkg/cincinnati"
)

// noChannel is the Reason set on the RetrievedUpdates condition when no
// update channel has been configured.
const noChannel string = "NoChannel"

// syncAvailableUpdates attempts to retrieve the latest updates and update the status of the ClusterVersion
// object. It will set the RetrievedUpdates condition. Updates are only checked if it has been more than
// the minimumUpdateCheckInterval since the last check.
Expand Down Expand Up @@ -179,7 +181,7 @@ func calculateAvailableUpdatesStatus(clusterID string, proxyURL *url.URL, tlsCon

if len(channel) == 0 {
return nil, configv1.ClusterOperatorStatusCondition{
Type: configv1.RetrievedUpdates, Status: configv1.ConditionFalse, Reason: "NoChannel",
Type: configv1.RetrievedUpdates, Status: configv1.ConditionFalse, Reason: noChannel,
Message: "The update channel has not been configured.",
}
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/cvo/cvo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ import (
"k8s.io/apimachinery/pkg/util/diff"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/discovery"
"k8s.io/client-go/rest"
kfake "k8s.io/client-go/kubernetes/fake"
"k8s.io/client-go/rest"
ktesting "k8s.io/client-go/testing"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog"
Expand Down Expand Up @@ -2375,7 +2375,7 @@ func TestOperator_availableUpdatesSync(t *testing.T) {
Condition: configv1.ClusterOperatorStatusCondition{
Type: configv1.RetrievedUpdates,
Status: configv1.ConditionFalse,
Reason: "NoChannel",
Reason: noChannel,
Message: "The update channel has not been configured.",
},
},
Expand Down
28 changes: 22 additions & 6 deletions pkg/cvo/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/tools/cache"
"k8s.io/klog"

configv1 "github.com/openshift/api/config/v1"
"github.com/openshift/cluster-version-operator/lib/resourcemerge"
Expand All @@ -28,12 +29,13 @@ type operatorMetrics struct {

conditionTransitions map[conditionKey]int

version *prometheus.GaugeVec
availableUpdates *prometheus.GaugeVec
clusterOperatorUp *prometheus.GaugeVec
clusterOperatorConditions *prometheus.GaugeVec
clusterOperatorConditionTransitions *prometheus.GaugeVec
clusterInstaller *prometheus.GaugeVec
version *prometheus.GaugeVec
availableUpdates *prometheus.GaugeVec
clusterOperatorUp *prometheus.GaugeVec
clusterOperatorConditions *prometheus.GaugeVec
clusterOperatorConditionTransitions *prometheus.GaugeVec
clusterInstaller *prometheus.GaugeVec
clusterVersionOperatorUpdateRetrievalTimestampSeconds *prometheus.GaugeVec
}

func newOperatorMetrics(optr *Operator) *operatorMetrics {
Expand Down Expand Up @@ -83,6 +85,10 @@ version for 'cluster', or empty for 'initial'.
Name: "cluster_installer",
Help: "Reports info about the installation process and, if applicable, the install tool.",
}, []string{"type", "version", "invoker"}),
clusterVersionOperatorUpdateRetrievalTimestampSeconds: prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "cluster_version_operator_update_retrieval_timestamp_seconds",
Help: "Reports when updates were last succesfully retrieved.",
}, []string{"name"}),
}
}

Expand Down Expand Up @@ -133,6 +139,7 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) {
ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc()
ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc()
ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc()
ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc()
}

func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
Expand Down Expand Up @@ -297,6 +304,15 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
g.Set(1.0)
ch <- g
}

// check ability to retrieve recommended updates
if availableUpdates := m.optr.getAvailableUpdates(); availableUpdates != nil {
g := m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("")
g.Set(float64(availableUpdates.LastSyncOrConfigChange.Unix()))
ch <- g
} else {
klog.Warningf("availableUpdates is nil")
}
}

func gaugeFromInstallConfigMap(cm *corev1.ConfigMap, gauge *prometheus.GaugeVec, installType string) prometheus.Gauge {
Expand Down