From 59c1513208c8cd746cbce735427f968bed54ff0b Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Tue, 29 Aug 2023 01:21:40 +0200 Subject: [PATCH 1/5] OCPBUGS-9050: Alert on failing conditional update risk evaluation --- ...er-version-operator_02_servicemonitor.yaml | 9 + pkg/cvo/availableupdates.go | 2 +- pkg/cvo/metrics.go | 33 ++- pkg/cvo/metrics_test.go | 222 ++++++++++++++++++ pkg/cvo/status.go | 4 + 5 files changed, 268 insertions(+), 2 deletions(-) diff --git a/install/0000_90_cluster-version-operator_02_servicemonitor.yaml b/install/0000_90_cluster-version-operator_02_servicemonitor.yaml index 4878257fb..ec4c80230 100644 --- a/install/0000_90_cluster-version-operator_02_servicemonitor.yaml +++ b/install/0000_90_cluster-version-operator_02_servicemonitor.yaml @@ -127,3 +127,12 @@ spec: for: 10m labels: severity: warning + - alert: CannotEvaluateConditionalUpdates + annotations: + summary: Cluster Version Operator cannot evaluate conditional update matches for {{ "{{ $value | humanizeDuration }}" }}. + description: Failure to evaluate conditional update matches means that Cluster Version Operator cannot decide whether an update path is recommended or not. + expr: | + (cluster_version_conditional_updates_recommended_conditions_seconds{recommended="unknown"}) >= 3000 + for: 10m + labels: + severity: warning diff --git a/pkg/cvo/availableupdates.go b/pkg/cvo/availableupdates.go index 90495c4cd..e6af39383 100644 --- a/pkg/cvo/availableupdates.go +++ b/pkg/cvo/availableupdates.go @@ -428,7 +428,7 @@ func unknownExposureMessage(risk configv1.ConditionalUpdateRisk, err error) stri func evaluateConditionalUpdate(ctx context.Context, conditionalUpdate *configv1.ConditionalUpdate, conditionRegistry clusterconditions.ConditionRegistry) *metav1.Condition { recommended := &metav1.Condition{ - Type: "Recommended", + Type: ConditionalUpdateConditionTypeRecommended, } messages := []string{} for _, risk := range conditionalUpdate.Risks { diff --git a/pkg/cvo/metrics.go b/pkg/cvo/metrics.go index 2583c315e..bd04f0baa 100644 --- a/pkg/cvo/metrics.go +++ b/pkg/cvo/metrics.go @@ -12,6 +12,7 @@ import ( "net/http" "os" "path/filepath" + "strings" "time" "github.com/prometheus/client_golang/prometheus" @@ -56,11 +57,16 @@ type operatorMetrics struct { clusterOperatorConditionTransitions *prometheus.GaugeVec clusterInstaller *prometheus.GaugeVec clusterVersionOperatorUpdateRetrievalTimestampSeconds *prometheus.GaugeVec + clusterVersionConditionalUpdatesRecommendedConditions *prometheus.GaugeVec + + // nowFunc is used to override the time.Now() function for testing. + nowFunc func() time.Time } func newOperatorMetrics(optr *Operator) *operatorMetrics { return &operatorMetrics{ - optr: optr, + optr: optr, + nowFunc: time.Now, conditionTransitions: make(map[conditionKey]int), @@ -114,6 +120,10 @@ penultimate completed version for 'completed'. Name: "cluster_version_operator_update_retrieval_timestamp_seconds", Help: "Reports when updates were last successfully retrieved.", }, []string{"name"}), + clusterVersionConditionalUpdatesRecommendedConditions: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cluster_version_conditional_updates_recommended_conditions_seconds", + Help: "Reports the unknown conditional updates conditions", + }, []string{"version", "recommended", "reason"}), } } @@ -346,8 +356,27 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) { ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc() ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc() ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc() + ch <- m.clusterVersionConditionalUpdatesRecommendedConditions.WithLabelValues("", "", "").Desc() } +func (m *operatorMetrics) collectConditionalUpdates(ch chan<- prometheus.Metric, updates []configv1.ConditionalUpdate) { + for _, update := range updates { + for _, condition := range update.Conditions { + if condition.Type != ConditionalUpdateConditionTypeRecommended { + continue + } + + g := m.clusterVersionConditionalUpdatesRecommendedConditions + gauge := g.WithLabelValues(update.Release.Version, strings.ToLower(string(condition.Status)), condition.Reason) + gauge.Set(m.nowFunc().Sub(condition.LastTransitionTime.Time).Seconds()) + ch <- gauge + } + } +} + +// Collect collects metrics from the operator into the channel ch. Some metrics +// are taken relative to the when parameter, which should be the time at which +// the metrics were collected. func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) { current := m.optr.currentVersion() var completed configv1.UpdateHistory @@ -468,6 +497,8 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) { } ch <- g } + + m.collectConditionalUpdates(ch, cv.Status.ConditionalUpdates) } g := m.version.WithLabelValues("current", current.Version, current.Image, completed.Version) diff --git a/pkg/cvo/metrics_test.go b/pkg/cvo/metrics_test.go index 2bab3c591..4758df88a 100644 --- a/pkg/cvo/metrics_test.go +++ b/pkg/cvo/metrics_test.go @@ -23,6 +23,44 @@ func Test_operatorMetrics_Collect(t *testing.T) { optr *Operator wants func(*testing.T, []prometheus.Metric) }{ + { + name: "collect conditional update recommendations", + optr: &Operator{ + name: "test", + releaseCreated: time.Unix(2, 0), + cvLister: &cvLister{ + Items: []*configv1.ClusterVersion{ + { + ObjectMeta: metav1.ObjectMeta{Name: "test"}, + Status: configv1.ClusterVersionStatus{ + ConditionalUpdates: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{Version: "4.5.6", Image: "pullspec/4.5.6"}, + Conditions: []metav1.Condition{ + { + Type: ConditionalUpdateConditionTypeRecommended, + Status: metav1.ConditionTrue, + LastTransitionTime: metav1.NewTime(time.Unix(3, 0)), + Reason: "RiskDoesNotApply", + Message: "Risk is not risky so you do not risk", + }, + }, + }, + }, + }, + }, + }, + }, + }, + wants: func(t *testing.T, metrics []prometheus.Metric) { + if len(metrics) != 5 { + t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics)) + } + // optr.releaseCreated is epoch+2s, LastTransitionTime is epoch+3s + // we mock the evaluation time to be one minute after optr.releaseCreated => expect 59s + expectMetric(t, metrics[2], 59, map[string]string{"reason": "RiskDoesNotApply", "recommended": "true", "version": "4.5.6"}) + }, + }, { name: "collects current version", optr: &Operator{ @@ -610,6 +648,7 @@ func Test_operatorMetrics_Collect(t *testing.T) { tt.optr.cmConfigLister = &cmConfigLister{} } m := newOperatorMetrics(tt.optr) + m.nowFunc = func() time.Time { return tt.optr.releaseCreated.Add(time.Minute) } descCh := make(chan *prometheus.Desc) go func() { for range descCh { @@ -727,6 +766,189 @@ func Test_operatorMetrics_CollectTransitions(t *testing.T) { } } +func TestCollectUnknownConditionalUpdates(t *testing.T) { + anchorTime := time.Now() + + type valueWithLabels struct { + value float64 + labels map[string]string + } + testCases := []struct { + name string + updates []configv1.ConditionalUpdate + evaluate time.Time + expected []valueWithLabels + }{ + { + name: "no conditional updates", + updates: []configv1.ConditionalUpdate{}, + evaluate: anchorTime.Add(time.Minute), + expected: []valueWithLabels{}, + }, + { + name: "no recommended conditions on updates", + updates: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{Version: "4.13.1", Image: "pullspec"}, + Risks: []configv1.ConditionalUpdateRisk{{Name: "Risk"}}, + Conditions: []metav1.Condition{{ + Type: "SomethingElseThanRecommended", + Status: "Unknown", + LastTransitionTime: metav1.NewTime(anchorTime), + Reason: "NoIdea", + Message: "No Idea", + }}, + }, + }, + expected: []valueWithLabels{}, + }, + { + name: "recommended false", + updates: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{Version: "4.13.1", Image: "pullspec"}, + Risks: []configv1.ConditionalUpdateRisk{{Name: "Risk"}}, + Conditions: []metav1.Condition{{ + Type: ConditionalUpdateConditionTypeRecommended, + Status: metav1.ConditionFalse, + LastTransitionTime: metav1.NewTime(anchorTime), + Reason: "RiskApplies", + Message: "Risk is risky so do not risk it", + }}, + }, + }, + evaluate: anchorTime.Add(time.Minute), + expected: []valueWithLabels{{ + value: 60, + labels: map[string]string{"version": "4.13.1", "recommended": "false", "reason": "RiskApplies"}, + }}, + }, + { + name: "recommended true", + updates: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{Version: "4.13.1", Image: "pullspec"}, + Risks: []configv1.ConditionalUpdateRisk{{Name: "Risk"}}, + Conditions: []metav1.Condition{{ + Type: ConditionalUpdateConditionTypeRecommended, + Status: metav1.ConditionTrue, + LastTransitionTime: metav1.NewTime(anchorTime), + Reason: "RiskDoesNotApply", + Message: "Risk is not risky so you do not risk", + }}, + }, + }, + evaluate: anchorTime.Add(time.Minute), + expected: []valueWithLabels{{ + value: 60, + labels: map[string]string{"version": "4.13.1", "recommended": "true", "reason": "RiskDoesNotApply"}, + }}, + }, + { + name: "recommended unknown", + updates: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{Version: "4.13.1", Image: "pullspec"}, + Risks: []configv1.ConditionalUpdateRisk{{Name: "Risk"}}, + Conditions: []metav1.Condition{{ + Type: ConditionalUpdateConditionTypeRecommended, + Status: metav1.ConditionUnknown, + LastTransitionTime: metav1.NewTime(anchorTime), + Reason: "EvaluationFailed", + Message: "No idea sorry", + }}, + }, + }, + evaluate: anchorTime.Add(time.Minute), + expected: []valueWithLabels{{ + value: 60, + labels: map[string]string{"version": "4.13.1", "recommended": "unknown", "reason": "EvaluationFailed"}, + }}, + }, + { + name: "multiple versions with different transition times", + updates: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{Version: "4.13.1", Image: "pullspec"}, + Risks: []configv1.ConditionalUpdateRisk{{Name: "Risk"}}, + Conditions: []metav1.Condition{{ + Type: ConditionalUpdateConditionTypeRecommended, + Status: metav1.ConditionFalse, + LastTransitionTime: metav1.NewTime(anchorTime), + Reason: "RiskApplies", + Message: "Risk is risky so do not risk it", + }}, + }, + { + Release: configv1.Release{Version: "4.13.2", Image: "pullspec"}, + Risks: []configv1.ConditionalUpdateRisk{{Name: "Risk"}}, + Conditions: []metav1.Condition{{ + Type: ConditionalUpdateConditionTypeRecommended, + Status: metav1.ConditionTrue, + LastTransitionTime: metav1.NewTime(anchorTime.Add(time.Minute)), + Reason: "RiskDoesNotApply", + Message: "Risk is not risky", + }}, + }, + { + Release: configv1.Release{Version: "4.13.3", Image: "pullspec"}, + Risks: []configv1.ConditionalUpdateRisk{{Name: "Risk"}}, + Conditions: []metav1.Condition{{ + Type: ConditionalUpdateConditionTypeRecommended, + Status: metav1.ConditionUnknown, + LastTransitionTime: metav1.NewTime(anchorTime.Add(time.Minute * 2)), + Reason: "EvaluationFailed", + Message: "Metrics stack was abducted by aliens", + }}, + }, + }, + evaluate: anchorTime.Add(5 * time.Minute), + expected: []valueWithLabels{ + { + value: 5 * 60, + labels: map[string]string{"version": "4.13.1", "recommended": "false", "reason": "RiskApplies"}, + }, + { + value: 4 * 60, + labels: map[string]string{"version": "4.13.2", "recommended": "true", "reason": "RiskDoesNotApply"}, + }, + { + value: 3 * 60, + labels: map[string]string{"version": "4.13.3", "recommended": "unknown", "reason": "EvaluationFailed"}, + }, + }, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + optr := &Operator{} + m := newOperatorMetrics(optr) + m.nowFunc = func() time.Time { return tc.evaluate } + ch := make(chan prometheus.Metric) + + go func() { + m.collectConditionalUpdates(ch, tc.updates) + close(ch) + }() + + var collected []prometheus.Metric + for item := range ch { + collected = append(collected, item) + } + + if lenC, lenE := len(collected), len(tc.expected); lenC != lenE { + + t.Fatalf("Expected %d metrics, got %d metrics\nGot metrics: %s", lenE, lenC, spew.Sdump(collected)) + } + for i := range tc.expected { + expectMetric(t, collected[i], tc.expected[i].value, tc.expected[i].labels) + } + }) + } +} + func expectMetric(t *testing.T, metric prometheus.Metric, value float64, labels map[string]string) { t.Helper() var d dto.Metric diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go index 76ccff25e..3e2d9c64b 100644 --- a/pkg/cvo/status.go +++ b/pkg/cvo/status.go @@ -31,6 +31,10 @@ const ( // and indicates the cluster is not healthy. ClusterStatusFailing = configv1.ClusterStatusConditionType("Failing") + // ConditionalUpdateConditionTypeRecommended is a type of the condition present on a conditional update + // that indicates whether the conditional update is recommended or not + ConditionalUpdateConditionTypeRecommended = "Recommended" + // MaxHistory is the maximum size of ClusterVersion history. Once exceeded // ClusterVersion history will be pruned. It is declared here and passed // into the pruner function to allow easier testing. From f78049813f5ae052a549aadfe78786f4152b8f79 Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Wed, 27 Sep 2023 16:38:53 +0200 Subject: [PATCH 2/5] Addressed review feedback - Used `max by` in the alert to drop irrelevant labels from alert - Replaced further `"Recommended"` literals with new constant - Do not `ToLower` booleanish string labels - Report metrics labelled with `condition` and `status` instead of a specialized `recommended` label. We only export the `Recommended` condition status still. - Naming and code structure tweaks --- ...ster-version-operator_02_servicemonitor.yaml | 3 +-- pkg/cvo/availableupdates.go | 6 +++--- pkg/cvo/metrics.go | 17 ++++++++--------- pkg/cvo/metrics_test.go | 14 +++++++------- pkg/cvo/status.go | 5 +++++ 5 files changed, 24 insertions(+), 21 deletions(-) diff --git a/install/0000_90_cluster-version-operator_02_servicemonitor.yaml b/install/0000_90_cluster-version-operator_02_servicemonitor.yaml index ec4c80230..e572e04e7 100644 --- a/install/0000_90_cluster-version-operator_02_servicemonitor.yaml +++ b/install/0000_90_cluster-version-operator_02_servicemonitor.yaml @@ -132,7 +132,6 @@ spec: summary: Cluster Version Operator cannot evaluate conditional update matches for {{ "{{ $value | humanizeDuration }}" }}. description: Failure to evaluate conditional update matches means that Cluster Version Operator cannot decide whether an update path is recommended or not. expr: | - (cluster_version_conditional_updates_recommended_conditions_seconds{recommended="unknown"}) >= 3000 - for: 10m + max by (version, condition, status, reason) (cluster_version_conditional_update_condition_seconds{condition="Recommended", status="Unknown"} >= 3600) labels: severity: warning diff --git a/pkg/cvo/availableupdates.go b/pkg/cvo/availableupdates.go index e6af39383..4262bebce 100644 --- a/pkg/cvo/availableupdates.go +++ b/pkg/cvo/availableupdates.go @@ -59,7 +59,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 } else if upstream == optrAvailableUpdates.Upstream || (upstream == optr.defaultUpstreamServer && optrAvailableUpdates.Upstream == "") { needsConditionalUpdateEval := false for _, conditionalUpdate := range optrAvailableUpdates.ConditionalUpdates { - if recommended := meta.FindStatusCondition(conditionalUpdate.Conditions, "Recommended"); recommended == nil { + if recommended := findRecommendedCondition(conditionalUpdate.Conditions); recommended == nil { needsConditionalUpdateEval = true break } else if recommended.Status != metav1.ConditionTrue && recommended.Status != metav1.ConditionFalse { @@ -120,7 +120,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 queueKey := optr.queueKey() for _, conditionalUpdate := range optrAvailableUpdates.ConditionalUpdates { - if recommended := meta.FindStatusCondition(conditionalUpdate.Conditions, "Recommended"); recommended == nil { + if recommended := findRecommendedCondition(conditionalUpdate.Conditions); recommended == nil { klog.Warningf("Requeue available-update evaluation, because %q lacks a Recommended condition", conditionalUpdate.Release.Version) optr.availableUpdatesQueue.AddAfter(queueKey, time.Second) break @@ -389,7 +389,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { u.removeUpdate(conditionalUpdate.Release.Image) } else { meta.SetStatusCondition(&conditionalUpdate.Conditions, metav1.Condition{ - Type: "Recommended", + Type: ConditionalUpdateConditionTypeRecommended, Status: metav1.ConditionTrue, // FIXME: ObservedGeneration? That would capture upstream/channel, but not necessarily the currently-reconciling version. Reason: "AsExpected", diff --git a/pkg/cvo/metrics.go b/pkg/cvo/metrics.go index bd04f0baa..3d4e194ae 100644 --- a/pkg/cvo/metrics.go +++ b/pkg/cvo/metrics.go @@ -12,7 +12,6 @@ import ( "net/http" "os" "path/filepath" - "strings" "time" "github.com/prometheus/client_golang/prometheus" @@ -57,7 +56,7 @@ type operatorMetrics struct { clusterOperatorConditionTransitions *prometheus.GaugeVec clusterInstaller *prometheus.GaugeVec clusterVersionOperatorUpdateRetrievalTimestampSeconds *prometheus.GaugeVec - clusterVersionConditionalUpdatesRecommendedConditions *prometheus.GaugeVec + clusterVersionConditionalUpdateConditionSeconds *prometheus.GaugeVec // nowFunc is used to override the time.Now() function for testing. nowFunc func() time.Time @@ -120,10 +119,10 @@ penultimate completed version for 'completed'. Name: "cluster_version_operator_update_retrieval_timestamp_seconds", Help: "Reports when updates were last successfully retrieved.", }, []string{"name"}), - clusterVersionConditionalUpdatesRecommendedConditions: prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cluster_version_conditional_updates_recommended_conditions_seconds", - Help: "Reports the unknown conditional updates conditions", - }, []string{"version", "recommended", "reason"}), + clusterVersionConditionalUpdateConditionSeconds: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cluster_version_conditional_update_condition_seconds", + Help: "Reports when condition on conditional updates were last updated", + }, []string{"version", "condition", "status", "reason"}), } } @@ -356,7 +355,7 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) { ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc() ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc() ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc() - ch <- m.clusterVersionConditionalUpdatesRecommendedConditions.WithLabelValues("", "", "").Desc() + ch <- m.clusterVersionConditionalUpdateConditionSeconds.WithLabelValues("", "", "", "").Desc() } func (m *operatorMetrics) collectConditionalUpdates(ch chan<- prometheus.Metric, updates []configv1.ConditionalUpdate) { @@ -366,8 +365,8 @@ func (m *operatorMetrics) collectConditionalUpdates(ch chan<- prometheus.Metric, continue } - g := m.clusterVersionConditionalUpdatesRecommendedConditions - gauge := g.WithLabelValues(update.Release.Version, strings.ToLower(string(condition.Status)), condition.Reason) + g := m.clusterVersionConditionalUpdateConditionSeconds + gauge := g.WithLabelValues(update.Release.Version, condition.Type, string(condition.Status), condition.Reason) gauge.Set(m.nowFunc().Sub(condition.LastTransitionTime.Time).Seconds()) ch <- gauge } diff --git a/pkg/cvo/metrics_test.go b/pkg/cvo/metrics_test.go index 4758df88a..28539cbb8 100644 --- a/pkg/cvo/metrics_test.go +++ b/pkg/cvo/metrics_test.go @@ -58,7 +58,7 @@ func Test_operatorMetrics_Collect(t *testing.T) { } // optr.releaseCreated is epoch+2s, LastTransitionTime is epoch+3s // we mock the evaluation time to be one minute after optr.releaseCreated => expect 59s - expectMetric(t, metrics[2], 59, map[string]string{"reason": "RiskDoesNotApply", "recommended": "true", "version": "4.5.6"}) + expectMetric(t, metrics[2], 59, map[string]string{"condition": "Recommended", "reason": "RiskDoesNotApply", "status": "True", "version": "4.5.6"}) }, }, { @@ -820,7 +820,7 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) { evaluate: anchorTime.Add(time.Minute), expected: []valueWithLabels{{ value: 60, - labels: map[string]string{"version": "4.13.1", "recommended": "false", "reason": "RiskApplies"}, + labels: map[string]string{"version": "4.13.1", "condition": "Recommended", "status": "False", "reason": "RiskApplies"}, }}, }, { @@ -841,7 +841,7 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) { evaluate: anchorTime.Add(time.Minute), expected: []valueWithLabels{{ value: 60, - labels: map[string]string{"version": "4.13.1", "recommended": "true", "reason": "RiskDoesNotApply"}, + labels: map[string]string{"version": "4.13.1", "condition": "Recommended", "status": "True", "reason": "RiskDoesNotApply"}, }}, }, { @@ -862,7 +862,7 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) { evaluate: anchorTime.Add(time.Minute), expected: []valueWithLabels{{ value: 60, - labels: map[string]string{"version": "4.13.1", "recommended": "unknown", "reason": "EvaluationFailed"}, + labels: map[string]string{"version": "4.13.1", "condition": "Recommended", "status": "Unknown", "reason": "EvaluationFailed"}, }}, }, { @@ -906,15 +906,15 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) { expected: []valueWithLabels{ { value: 5 * 60, - labels: map[string]string{"version": "4.13.1", "recommended": "false", "reason": "RiskApplies"}, + labels: map[string]string{"version": "4.13.1", "condition": "Recommended", "status": "False", "reason": "RiskApplies"}, }, { value: 4 * 60, - labels: map[string]string{"version": "4.13.2", "recommended": "true", "reason": "RiskDoesNotApply"}, + labels: map[string]string{"version": "4.13.2", "condition": "Recommended", "status": "True", "reason": "RiskDoesNotApply"}, }, { value: 3 * 60, - labels: map[string]string{"version": "4.13.3", "recommended": "unknown", "reason": "EvaluationFailed"}, + labels: map[string]string{"version": "4.13.3", "condition": "Recommended", "status": "Unknown", "reason": "EvaluationFailed"}, }, }, }, diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go index 3e2d9c64b..37e942788 100644 --- a/pkg/cvo/status.go +++ b/pkg/cvo/status.go @@ -14,6 +14,7 @@ import ( "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/diff" "k8s.io/apimachinery/pkg/util/validation/field" @@ -41,6 +42,10 @@ const ( MaxHistory = 100 ) +func findRecommendedCondition(conditions []metav1.Condition) *metav1.Condition { + return meta.FindStatusCondition(conditions, ConditionalUpdateConditionTypeRecommended) +} + func mergeEqualVersions(current *configv1.UpdateHistory, desired configv1.Release) bool { if len(desired.Image) > 0 && desired.Image == current.Image { if len(desired.Version) == 0 { From 7225971ec619bc999735e46b2a912ab44ed16690 Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Mon, 9 Oct 2023 20:11:25 +0200 Subject: [PATCH 3/5] Improve the help message for `cluster_version_conditional_update_condition_seconds` --- pkg/cvo/metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/cvo/metrics.go b/pkg/cvo/metrics.go index 3d4e194ae..f183e878d 100644 --- a/pkg/cvo/metrics.go +++ b/pkg/cvo/metrics.go @@ -121,7 +121,7 @@ penultimate completed version for 'completed'. }, []string{"name"}), clusterVersionConditionalUpdateConditionSeconds: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "cluster_version_conditional_update_condition_seconds", - Help: "Reports when condition on conditional updates were last updated", + Help: "Reports the duration (in seconds) since the Recommended condition status on a conditional update changed to its current state.", }, []string{"version", "condition", "status", "reason"}), } } From 33be59508df5d7dfc8cad55950424a48cf551329 Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Fri, 20 Oct 2023 18:02:49 +0200 Subject: [PATCH 4/5] metrics: pivot `..._conditional_update_condition_seconds` from duration to ts --- ...er-version-operator_02_servicemonitor.yaml | 7 +++++- pkg/cvo/metrics.go | 14 ++++------- pkg/cvo/metrics_test.go | 24 ++++++------------- 3 files changed, 17 insertions(+), 28 deletions(-) diff --git a/install/0000_90_cluster-version-operator_02_servicemonitor.yaml b/install/0000_90_cluster-version-operator_02_servicemonitor.yaml index e572e04e7..44a5a6c3f 100644 --- a/install/0000_90_cluster-version-operator_02_servicemonitor.yaml +++ b/install/0000_90_cluster-version-operator_02_servicemonitor.yaml @@ -132,6 +132,11 @@ spec: summary: Cluster Version Operator cannot evaluate conditional update matches for {{ "{{ $value | humanizeDuration }}" }}. description: Failure to evaluate conditional update matches means that Cluster Version Operator cannot decide whether an update path is recommended or not. expr: | - max by (version, condition, status, reason) (cluster_version_conditional_update_condition_seconds{condition="Recommended", status="Unknown"} >= 3600) + max by (version, condition, status, reason) + ( + ( + time()-cluster_version_conditional_update_condition_seconds{condition="Recommended", status="Unknown"} + ) >= 3600 + ) labels: severity: warning diff --git a/pkg/cvo/metrics.go b/pkg/cvo/metrics.go index f183e878d..c021eedc3 100644 --- a/pkg/cvo/metrics.go +++ b/pkg/cvo/metrics.go @@ -57,15 +57,11 @@ type operatorMetrics struct { clusterInstaller *prometheus.GaugeVec clusterVersionOperatorUpdateRetrievalTimestampSeconds *prometheus.GaugeVec clusterVersionConditionalUpdateConditionSeconds *prometheus.GaugeVec - - // nowFunc is used to override the time.Now() function for testing. - nowFunc func() time.Time } func newOperatorMetrics(optr *Operator) *operatorMetrics { return &operatorMetrics{ - optr: optr, - nowFunc: time.Now, + optr: optr, conditionTransitions: make(map[conditionKey]int), @@ -121,7 +117,7 @@ penultimate completed version for 'completed'. }, []string{"name"}), clusterVersionConditionalUpdateConditionSeconds: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "cluster_version_conditional_update_condition_seconds", - Help: "Reports the duration (in seconds) since the Recommended condition status on a conditional update changed to its current state.", + Help: "Reports when the Recommended condition status on a conditional update changed to its current state.", }, []string{"version", "condition", "status", "reason"}), } } @@ -367,15 +363,13 @@ func (m *operatorMetrics) collectConditionalUpdates(ch chan<- prometheus.Metric, g := m.clusterVersionConditionalUpdateConditionSeconds gauge := g.WithLabelValues(update.Release.Version, condition.Type, string(condition.Status), condition.Reason) - gauge.Set(m.nowFunc().Sub(condition.LastTransitionTime.Time).Seconds()) + gauge.Set(float64(condition.LastTransitionTime.Unix())) ch <- gauge } } } -// Collect collects metrics from the operator into the channel ch. Some metrics -// are taken relative to the when parameter, which should be the time at which -// the metrics were collected. +// Collect collects metrics from the operator into the channel ch func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) { current := m.optr.currentVersion() var completed configv1.UpdateHistory diff --git a/pkg/cvo/metrics_test.go b/pkg/cvo/metrics_test.go index 28539cbb8..fe56cdcce 100644 --- a/pkg/cvo/metrics_test.go +++ b/pkg/cvo/metrics_test.go @@ -56,9 +56,7 @@ func Test_operatorMetrics_Collect(t *testing.T) { if len(metrics) != 5 { t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics)) } - // optr.releaseCreated is epoch+2s, LastTransitionTime is epoch+3s - // we mock the evaluation time to be one minute after optr.releaseCreated => expect 59s - expectMetric(t, metrics[2], 59, map[string]string{"condition": "Recommended", "reason": "RiskDoesNotApply", "status": "True", "version": "4.5.6"}) + expectMetric(t, metrics[2], 3, map[string]string{"condition": "Recommended", "reason": "RiskDoesNotApply", "status": "True", "version": "4.5.6"}) }, }, { @@ -648,7 +646,6 @@ func Test_operatorMetrics_Collect(t *testing.T) { tt.optr.cmConfigLister = &cmConfigLister{} } m := newOperatorMetrics(tt.optr) - m.nowFunc = func() time.Time { return tt.optr.releaseCreated.Add(time.Minute) } descCh := make(chan *prometheus.Desc) go func() { for range descCh { @@ -776,13 +773,11 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) { testCases := []struct { name string updates []configv1.ConditionalUpdate - evaluate time.Time expected []valueWithLabels }{ { name: "no conditional updates", updates: []configv1.ConditionalUpdate{}, - evaluate: anchorTime.Add(time.Minute), expected: []valueWithLabels{}, }, { @@ -817,9 +812,8 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) { }}, }, }, - evaluate: anchorTime.Add(time.Minute), expected: []valueWithLabels{{ - value: 60, + value: float64(anchorTime.Unix()), labels: map[string]string{"version": "4.13.1", "condition": "Recommended", "status": "False", "reason": "RiskApplies"}, }}, }, @@ -838,9 +832,8 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) { }}, }, }, - evaluate: anchorTime.Add(time.Minute), expected: []valueWithLabels{{ - value: 60, + value: float64(anchorTime.Unix()), labels: map[string]string{"version": "4.13.1", "condition": "Recommended", "status": "True", "reason": "RiskDoesNotApply"}, }}, }, @@ -859,9 +852,8 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) { }}, }, }, - evaluate: anchorTime.Add(time.Minute), expected: []valueWithLabels{{ - value: 60, + value: float64(anchorTime.Unix()), labels: map[string]string{"version": "4.13.1", "condition": "Recommended", "status": "Unknown", "reason": "EvaluationFailed"}, }}, }, @@ -902,18 +894,17 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) { }}, }, }, - evaluate: anchorTime.Add(5 * time.Minute), expected: []valueWithLabels{ { - value: 5 * 60, + value: float64(anchorTime.Unix()), labels: map[string]string{"version": "4.13.1", "condition": "Recommended", "status": "False", "reason": "RiskApplies"}, }, { - value: 4 * 60, + value: float64(anchorTime.Unix() + 60), labels: map[string]string{"version": "4.13.2", "condition": "Recommended", "status": "True", "reason": "RiskDoesNotApply"}, }, { - value: 3 * 60, + value: float64(anchorTime.Unix() + 120), labels: map[string]string{"version": "4.13.3", "condition": "Recommended", "status": "Unknown", "reason": "EvaluationFailed"}, }, }, @@ -925,7 +916,6 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) { t.Run(tc.name, func(t *testing.T) { optr := &Operator{} m := newOperatorMetrics(optr) - m.nowFunc = func() time.Time { return tc.evaluate } ch := make(chan prometheus.Metric) go func() { From 983b3d01916d18368a1ce842321d6a913387e055 Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Mon, 23 Oct 2023 20:42:02 +0200 Subject: [PATCH 5/5] availableupdates: populate internal state from CV status on startup If the CVO is just starting up, it should populate its "known state" of available updates from the `ClusterVersion` status, if its contains them. Previous CVO may have evaluated the same graph data like the the current one is about to do and if it did, there are likely existing conditions in the status that we need to respect (for example, do not bump a `lastTransitionTime` field on a condition on a conditinal update that was already evaluated with the same result. --- pkg/cvo/availableupdates.go | 8 ++++- pkg/cvo/availableupdates_test.go | 62 +++++++++++++++++++++++++------- 2 files changed, 56 insertions(+), 14 deletions(-) diff --git a/pkg/cvo/availableupdates.go b/pkg/cvo/availableupdates.go index 4262bebce..32734da5f 100644 --- a/pkg/cvo/availableupdates.go +++ b/pkg/cvo/availableupdates.go @@ -50,6 +50,12 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 if optrAvailableUpdates == nil { klog.V(2).Info("First attempt to retrieve available updates") optrAvailableUpdates = &availableUpdates{} + // Populate known conditional updates from CV status, if present. They will be re-fetched later, + // but we need to populate existing conditions to avoid bumping lastTransitionTime fields on + // conditions if their status hasn't changed since previous CVO evaluated them. + for i := range config.Status.ConditionalUpdates { + optrAvailableUpdates.ConditionalUpdates = append(optrAvailableUpdates.ConditionalUpdates, *config.Status.ConditionalUpdates[i].DeepCopy()) + } } else if !optrAvailableUpdates.RecentlyChanged(optr.minimumUpdateCheckInterval) { klog.V(2).Infof("Retrieving available updates again, because more than %s has elapsed since %s", optr.minimumUpdateCheckInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339)) } else if channel != optrAvailableUpdates.Channel { @@ -382,7 +388,6 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { vj := semver.MustParse(u.ConditionalUpdates[j].Release.Version) return vi.GTE(vj) }) - for i, conditionalUpdate := range u.ConditionalUpdates { if errorCondition := evaluateConditionalUpdate(ctx, &conditionalUpdate, u.ConditionRegistry); errorCondition != nil { meta.SetStatusCondition(&conditionalUpdate.Conditions, *errorCondition) @@ -398,6 +403,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { u.addUpdate(conditionalUpdate.Release) } u.ConditionalUpdates[i].Conditions = conditionalUpdate.Conditions + } } diff --git a/pkg/cvo/availableupdates_test.go b/pkg/cvo/availableupdates_test.go index 117ec2161..65b55dfe3 100644 --- a/pkg/cvo/availableupdates_test.go +++ b/pkg/cvo/availableupdates_test.go @@ -92,10 +92,11 @@ func osusWithSingleConditionalEdge() (*httptest.Server, clusterconditions.Condit }, Conditions: []metav1.Condition{ { - Type: "Recommended", - Status: metav1.ConditionFalse, - Reason: "FourFiveSix", - Message: "Four Five Five is just fine https://example.com/" + to, + Type: "Recommended", + Status: metav1.ConditionFalse, + Reason: "FourFiveSix", + Message: "Four Five Five is just fine https://example.com/" + to, + LastTransitionTime: metav1.Now(), }, }, }, @@ -167,27 +168,60 @@ func TestSyncAvailableUpdates(t *testing.T) { func TestSyncAvailableUpdates_ConditionalUpdateRecommendedConditions(t *testing.T) { testCases := []struct { name string - modifyOriginalState func(condition *metav1.Condition) + modifyOriginalState func(optr *Operator) + modifyCV func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate) expectTimeChange bool }{ { name: "lastTransitionTime is not updated when nothing changes", - modifyOriginalState: func(condition *metav1.Condition) {}, + modifyOriginalState: func(optr *Operator) {}, + modifyCV: func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate) {}, }, { name: "lastTransitionTime is not updated when changed but status is identical", - modifyOriginalState: func(condition *metav1.Condition) { - condition.Reason = "OldReason" - condition.Message = "This message should be changed to something else" + modifyOriginalState: func(optr *Operator) { + optr.availableUpdates.ConditionalUpdates[0].Conditions[0].Reason = "OldReason" + optr.availableUpdates.ConditionalUpdates[0].Conditions[0].Message = "This message should be changed to something else" }, + modifyCV: func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate) {}, }, { name: "lastTransitionTime is updated when status changes", - modifyOriginalState: func(condition *metav1.Condition) { - condition.Status = metav1.ConditionUnknown + modifyOriginalState: func(optr *Operator) { + optr.availableUpdates.ConditionalUpdates[0].Conditions[0].Status = metav1.ConditionUnknown }, + modifyCV: func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate) {}, expectTimeChange: true, }, + { + name: "lastTransitionTime is updated on first fetch with empty CV status", + modifyOriginalState: func(optr *Operator) { + optr.availableUpdates = nil + }, + modifyCV: func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate) {}, + expectTimeChange: true, + }, + { + name: "lastTransitionTime is updated on first fetch when condition status in CV status differs from fetched status", + modifyOriginalState: func(optr *Operator) { + optr.availableUpdates = nil + }, + modifyCV: func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate) { + cv.Status.ConditionalUpdates = []configv1.ConditionalUpdate{*update.DeepCopy()} + cv.Status.ConditionalUpdates[0].Conditions[0].Status = metav1.ConditionUnknown + }, + expectTimeChange: true, + }, + { + name: "lastTransitionTime is not updated on first fetch when condition status in CV status matches fetched status", + modifyOriginalState: func(optr *Operator) { + optr.availableUpdates = nil + }, + modifyCV: func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate) { + cv.Status.ConditionalUpdates = []configv1.ConditionalUpdate{*update.DeepCopy()} + }, + expectTimeChange: false, + }, } for _, tc := range testCases { @@ -199,9 +233,11 @@ func TestSyncAvailableUpdates_ConditionalUpdateRecommendedConditions(t *testing. optr.availableUpdates.ConditionalUpdates = conditionalUpdates expectedConditions := []metav1.Condition{{}} conditionalUpdates[0].Conditions[0].DeepCopyInto(&expectedConditions[0]) - tc.modifyOriginalState(&optr.availableUpdates.ConditionalUpdates[0].Conditions[0]) + cv := cvFixture.DeepCopy() + tc.modifyOriginalState(optr) + tc.modifyCV(cv, conditionalUpdates[0]) - err := optr.syncAvailableUpdates(context.Background(), cvFixture) + err := optr.syncAvailableUpdates(context.Background(), cv) if err != nil { t.Fatalf("syncAvailableUpdates() unexpected error: %v", err)