diff --git a/install/0000_90_cluster-version-operator_02_servicemonitor.yaml b/install/0000_90_cluster-version-operator_02_servicemonitor.yaml index a1d05dbe0d..da86f29281 100644 --- a/install/0000_90_cluster-version-operator_02_servicemonitor.yaml +++ b/install/0000_90_cluster-version-operator_02_servicemonitor.yaml @@ -40,7 +40,8 @@ spec: rules: - alert: ClusterVersionOperatorDown annotations: - message: Cluster version operator has disappeared from Prometheus target discovery. Operator may be down or disabled, cluster will not be kept up to date and upgrades will not be possible. + summary: Cluster version operator has disappeared from Prometheus target discovery. + description: The operator may be down or disabled. The cluster will not be kept up to date and upgrades will not be possible. Inspect the openshift-cluster-version namespace for events or changes to the cluster-version-operator deployment or pods to diagnose and repair. {{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} For more information refer to {{ label \"url\" (first $console_url ) }}/k8s/cluster/projects/openshift-cluster-version.{{ end }}{{ end }}" }} expr: | absent(up{job="cluster-version-operator"} == 1) for: 10m @@ -48,14 +49,16 @@ spec: severity: critical - alert: CannotRetrieveUpdates annotations: - message: Cluster version operator has not retrieved updates in {{ "{{ $value | humanizeDuration }}" }}. Failure reason {{ "{{ with $cluster_operator_conditions := \"cluster_operator_conditions\" | query}}{{range $value := .}}{{if and (eq (label \"name\" $value) \"version\") (eq (label \"condition\" $value) \"RetrievedUpdates\") (eq (label \"endpoint\" $value) \"metrics\") (eq (value $value) 0.0)}}{{label \"reason\" $value}} {{end}}{{end}}{{end}}" }}. {{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} For more information refer to {{ label \"url\" (first $console_url ) }}/settings/cluster/.{{ end }}{{ end }}" }} + summary: Cluster version operator has not retrieved updates in {{ "{{ $value | humanizeDuration }}" }}. + description: Failure to retrieve updates means that cluster administrators will need to monitor for available updates on their own or risk falling behind on security or other bugfixes. If the failure is expected, you can clear spec.channel in the ClusterVersion object to tell the cluster-version operator to not retrieve updates. Failure reason {{ "{{ with $cluster_operator_conditions := \"cluster_operator_conditions\" | query}}{{range $value := .}}{{if and (eq (label \"name\" $value) \"version\") (eq (label \"condition\" $value) \"RetrievedUpdates\") (eq (label \"endpoint\" $value) \"metrics\") (eq (value $value) 0.0)}}{{label \"reason\" $value}} {{end}}{{end}}{{end}}" }}. {{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} For more information refer to {{ label \"url\" (first $console_url ) }}/settings/cluster/.{{ end }}{{ end }}" }} expr: | (time()-cluster_version_operator_update_retrieval_timestamp_seconds) >= 3600 and ignoring(condition, name, reason) cluster_operator_conditions{name="version", condition="RetrievedUpdates", endpoint="metrics", reason!="NoChannel"} labels: severity: warning - alert: UpdateAvailable annotations: - message: Your upstream update recommendation service recommends you update your cluster. For more information refer to 'oc adm upgrade'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}. + summary: Your upstream update recommendation service recommends you update your cluster. + description: For more information refer to 'oc adm upgrade'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}. expr: | cluster_version_available_updates > 0 labels: @@ -64,7 +67,8 @@ spec: rules: - alert: ClusterNotUpgradeable annotations: - message: One or more cluster operators have been blocking minor version cluster upgrades for at least an hour for reason {{ "{{ with $cluster_operator_conditions := \"cluster_operator_conditions\" | query}}{{range $value := .}}{{if and (eq (label \"name\" $value) \"version\") (eq (label \"condition\" $value) \"Upgradeable\") (eq (label \"endpoint\" $value) \"metrics\") (eq (value $value) 0.0) (ne (len (label \"reason\" $value)) 0) }}{{label \"reason\" $value}}.{{end}}{{end}}{{end}}"}} {{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} For more information refer to {{ label \"url\" (first $console_url ) }}/settings/cluster/.{{ end }}{{ end }}" }} + summary: One or more cluster operators have been blocking minor version cluster upgrades for at least an hour. + description: In most cases, you will still be able to apply patch releases. Reason {{ "{{ with $cluster_operator_conditions := \"cluster_operator_conditions\" | query}}{{range $value := .}}{{if and (eq (label \"name\" $value) \"version\") (eq (label \"condition\" $value) \"Upgradeable\") (eq (label \"endpoint\" $value) \"metrics\") (eq (value $value) 0.0) (ne (len (label \"reason\" $value)) 0) }}{{label \"reason\" $value}}.{{end}}{{end}}{{end}}"}} For more information refer to 'oc adm upgrade'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}. expr: | max by (name, condition, endpoint) (cluster_operator_conditions{name="version", condition="Upgradeable", endpoint="metrics"} == 0) for: 60m @@ -72,7 +76,8 @@ spec: severity: warning - alert: ClusterOperatorDown annotations: - message: Cluster operator {{ "{{ $labels.name }}" }} has not been available for 10 minutes. Operator may be down or disabled, cluster will not be kept up to date and upgrades will not be possible. + summary: Cluster operator has not been available for 10 minutes. + description: The {{ "{{ $labels.name }}" }} operator may be down or disabled, and the components it manages may be unavailable or degraded. Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator {{ "{{ $labels.name }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}. expr: | cluster_operator_up{job="cluster-version-operator"} == 0 for: 10m @@ -80,7 +85,8 @@ spec: severity: critical - alert: ClusterOperatorDegraded annotations: - message: Cluster operator {{ "{{ $labels.name }}" }} has been degraded for 30 minutes. Operator is degraded because {{ "{{ $labels.reason }}" }} and cluster upgrades will be unstable. + summary: Cluster operator has been degraded for 30 minutes. + description: The {{ "{{ $labels.name }}" }} operator id degraded because {{ "{{ $labels.reason }}" }}, and the components it manages may have reduced quality of service. Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator {{ "{{ $labels.name }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}. expr: | ( cluster_operator_conditions{job="cluster-version-operator", condition="Degraded"} @@ -92,7 +98,8 @@ spec: severity: warning - alert: ClusterOperatorFlapping annotations: - message: Cluster operator {{ "{{ $labels.name }}" }} up status is changing often. This might cause upgrades to be unstable. + summary: Cluster operator up status is changing often. + description: The {{ "{{ $labels.name }}" }} operator behavior might cause upgrades to be unstable. For more information refer to 'oc get -o yaml clusteroperator {{ "{{ $labels.name }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}. expr: | changes(cluster_operator_up{job="cluster-version-operator"}[2m]) > 2 for: 10m