Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 20 additions & 9 deletions install/0000_90_cluster-version-operator_02_servicemonitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,21 +48,29 @@ spec:
absent(up{job="cluster-version-operator"} == 1)
for: 10m
labels:
+ namespace: openshift-cluster-version
severity: critical
- alert: CannotRetrieveUpdates
annotations:
summary: Cluster version operator has not retrieved updates in {{ "{{ $value | humanizeDuration }}" }}.
description: Failure to retrieve updates means that cluster administrators will need to monitor for available updates on their own or risk falling behind on security or other bugfixes. If the failure is expected, you can clear spec.channel in the ClusterVersion object to tell the cluster-version operator to not retrieve updates. Failure reason {{ "{{ with $cluster_operator_conditions := \"cluster_operator_conditions\" | query}}{{range $value := .}}{{if and (eq (label \"name\" $value) \"version\") (eq (label \"condition\" $value) \"RetrievedUpdates\") (eq (label \"endpoint\" $value) \"metrics\") (eq (value $value) 0.0)}}{{label \"reason\" $value}} {{end}}{{end}}{{end}}" }}. {{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} For more information refer to {{ label \"url\" (first $console_url ) }}/settings/cluster/.{{ end }}{{ end }}" }}
expr: |
- (time()-cluster_version_operator_update_retrieval_timestamp_seconds) >= 3600 and ignoring(condition, name, reason) cluster_operator_conditions{name="version", condition="RetrievedUpdates", endpoint="metrics", reason!="NoChannel"}
+ max by (namespace)
+ (
+ (
+ time()-cluster_version_operator_update_retrieval_timestamp_seconds
+ ) >= 3600
+ and ignoring(condition, name, reason)
+ (cluster_operator_conditions{name="version", condition="RetrievedUpdates", endpoint="metrics", reason!="NoChannel"})
+ )
labels:
severity: warning
- alert: UpdateAvailable
annotations:
summary: Your upstream update recommendation service recommends you update your cluster.
description: For more information refer to 'oc adm upgrade'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}.
expr: |
- sum by (channel,upstream) (cluster_version_available_updates) > 0
+ sum by (channel, namespace, upstream) (cluster_version_available_updates) > 0
labels:
severity: info
- name: cluster-operators
Expand All @@ -72,7 +80,7 @@ spec:
summary: One or more cluster operators have been blocking minor version cluster upgrades for at least an hour.
description: In most cases, you will still be able to apply patch releases. Reason {{ "{{ with $cluster_operator_conditions := \"cluster_operator_conditions\" | query}}{{range $value := .}}{{if and (eq (label \"name\" $value) \"version\") (eq (label \"condition\" $value) \"Upgradeable\") (eq (label \"endpoint\" $value) \"metrics\") (eq (value $value) 0.0) (ne (len (label \"reason\" $value)) 0) }}{{label \"reason\" $value}}.{{end}}{{end}}{{end}}"}} For more information refer to 'oc adm upgrade'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}.
expr: |
- max by (name, condition, endpoint) (cluster_operator_conditions{name="version", condition="Upgradeable", endpoint="metrics"} == 0)
+ max by (namespace, name, condition, endpoint) (cluster_operator_conditions{name="version", condition="Upgradeable", endpoint="metrics"} == 0)
for: 60m
labels:
severity: info
Expand All @@ -81,7 +89,7 @@ spec:
summary: Cluster operator has not been available for 10 minutes.
description: The {{ "{{ $labels.name }}" }} operator may be down or disabled, and the components it manages may be unavailable or degraded. Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator {{ "{{ $labels.name }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}.
expr: |
- cluster_operator_up{job="cluster-version-operator"} == 0
+ max by (namespace, name) (cluster_operator_up{job="cluster-version-operator"} == 0)
for: 10m
labels:
severity: critical
Expand All @@ -90,11 +98,14 @@ spec:
summary: Cluster operator has been degraded for 30 minutes.
description: The {{ "{{ $labels.name }}" }} operator is degraded because {{ "{{ $labels.reason }}" }}, and the components it manages may have reduced quality of service. Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator {{ "{{ $labels.name }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}.
expr: |
+ max by (namespace, name, reason)
(
- cluster_operator_conditions{job="cluster-version-operator", condition="Degraded"}
- or on (name)
- group by (name) (cluster_operator_up{job="cluster-version-operator"})
- ) == 1
+ (
+ cluster_operator_conditions{job="cluster-version-operator", condition="Degraded"}
+ or on (namespace, name)
+ group by (namespace, name) (cluster_operator_up{job="cluster-version-operator"})
+ ) == 1
+ )
for: 30m
labels:
severity: warning
Expand All @@ -103,7 +114,7 @@ spec:
summary: Cluster operator up status is changing often.
description: The {{ "{{ $labels.name }}" }} operator behavior might cause upgrades to be unstable. For more information refer to 'oc get -o yaml clusteroperator {{ "{{ $labels.name }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}.
expr: |
- changes(cluster_operator_up{job="cluster-version-operator"}[2m]) > 2
+ max by (namespace, name) (changes(cluster_operator_up{job="cluster-version-operator"}[2m]) > 2)
for: 10m
labels:
severity: warning