diff --git a/install/0000_90_cluster-version-operator_02_servicemonitor.yaml b/install/0000_90_cluster-version-operator_02_servicemonitor.yaml index 4878257fb..44a5a6c3f 100644 --- a/install/0000_90_cluster-version-operator_02_servicemonitor.yaml +++ b/install/0000_90_cluster-version-operator_02_servicemonitor.yaml @@ -127,3 +127,16 @@ spec: for: 10m labels: severity: warning + - alert: CannotEvaluateConditionalUpdates + annotations: + summary: Cluster Version Operator cannot evaluate conditional update matches for {{ "{{ $value | humanizeDuration }}" }}. + description: Failure to evaluate conditional update matches means that Cluster Version Operator cannot decide whether an update path is recommended or not. + expr: | + max by (version, condition, status, reason) + ( + ( + time()-cluster_version_conditional_update_condition_seconds{condition="Recommended", status="Unknown"} + ) >= 3600 + ) + labels: + severity: warning diff --git a/pkg/cvo/availableupdates.go b/pkg/cvo/availableupdates.go index 90495c4cd..32734da5f 100644 --- a/pkg/cvo/availableupdates.go +++ b/pkg/cvo/availableupdates.go @@ -50,6 +50,12 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 if optrAvailableUpdates == nil { klog.V(2).Info("First attempt to retrieve available updates") optrAvailableUpdates = &availableUpdates{} + // Populate known conditional updates from CV status, if present. They will be re-fetched later, + // but we need to populate existing conditions to avoid bumping lastTransitionTime fields on + // conditions if their status hasn't changed since previous CVO evaluated them. + for i := range config.Status.ConditionalUpdates { + optrAvailableUpdates.ConditionalUpdates = append(optrAvailableUpdates.ConditionalUpdates, *config.Status.ConditionalUpdates[i].DeepCopy()) + } } else if !optrAvailableUpdates.RecentlyChanged(optr.minimumUpdateCheckInterval) { klog.V(2).Infof("Retrieving available updates again, because more than %s has elapsed since %s", optr.minimumUpdateCheckInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339)) } else if channel != optrAvailableUpdates.Channel { @@ -59,7 +65,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 } else if upstream == optrAvailableUpdates.Upstream || (upstream == optr.defaultUpstreamServer && optrAvailableUpdates.Upstream == "") { needsConditionalUpdateEval := false for _, conditionalUpdate := range optrAvailableUpdates.ConditionalUpdates { - if recommended := meta.FindStatusCondition(conditionalUpdate.Conditions, "Recommended"); recommended == nil { + if recommended := findRecommendedCondition(conditionalUpdate.Conditions); recommended == nil { needsConditionalUpdateEval = true break } else if recommended.Status != metav1.ConditionTrue && recommended.Status != metav1.ConditionFalse { @@ -120,7 +126,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 queueKey := optr.queueKey() for _, conditionalUpdate := range optrAvailableUpdates.ConditionalUpdates { - if recommended := meta.FindStatusCondition(conditionalUpdate.Conditions, "Recommended"); recommended == nil { + if recommended := findRecommendedCondition(conditionalUpdate.Conditions); recommended == nil { klog.Warningf("Requeue available-update evaluation, because %q lacks a Recommended condition", conditionalUpdate.Release.Version) optr.availableUpdatesQueue.AddAfter(queueKey, time.Second) break @@ -382,14 +388,13 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { vj := semver.MustParse(u.ConditionalUpdates[j].Release.Version) return vi.GTE(vj) }) - for i, conditionalUpdate := range u.ConditionalUpdates { if errorCondition := evaluateConditionalUpdate(ctx, &conditionalUpdate, u.ConditionRegistry); errorCondition != nil { meta.SetStatusCondition(&conditionalUpdate.Conditions, *errorCondition) u.removeUpdate(conditionalUpdate.Release.Image) } else { meta.SetStatusCondition(&conditionalUpdate.Conditions, metav1.Condition{ - Type: "Recommended", + Type: ConditionalUpdateConditionTypeRecommended, Status: metav1.ConditionTrue, // FIXME: ObservedGeneration? That would capture upstream/channel, but not necessarily the currently-reconciling version. Reason: "AsExpected", @@ -398,6 +403,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { u.addUpdate(conditionalUpdate.Release) } u.ConditionalUpdates[i].Conditions = conditionalUpdate.Conditions + } } @@ -428,7 +434,7 @@ func unknownExposureMessage(risk configv1.ConditionalUpdateRisk, err error) stri func evaluateConditionalUpdate(ctx context.Context, conditionalUpdate *configv1.ConditionalUpdate, conditionRegistry clusterconditions.ConditionRegistry) *metav1.Condition { recommended := &metav1.Condition{ - Type: "Recommended", + Type: ConditionalUpdateConditionTypeRecommended, } messages := []string{} for _, risk := range conditionalUpdate.Risks { diff --git a/pkg/cvo/availableupdates_test.go b/pkg/cvo/availableupdates_test.go index 117ec2161..65b55dfe3 100644 --- a/pkg/cvo/availableupdates_test.go +++ b/pkg/cvo/availableupdates_test.go @@ -92,10 +92,11 @@ func osusWithSingleConditionalEdge() (*httptest.Server, clusterconditions.Condit }, Conditions: []metav1.Condition{ { - Type: "Recommended", - Status: metav1.ConditionFalse, - Reason: "FourFiveSix", - Message: "Four Five Five is just fine https://example.com/" + to, + Type: "Recommended", + Status: metav1.ConditionFalse, + Reason: "FourFiveSix", + Message: "Four Five Five is just fine https://example.com/" + to, + LastTransitionTime: metav1.Now(), }, }, }, @@ -167,27 +168,60 @@ func TestSyncAvailableUpdates(t *testing.T) { func TestSyncAvailableUpdates_ConditionalUpdateRecommendedConditions(t *testing.T) { testCases := []struct { name string - modifyOriginalState func(condition *metav1.Condition) + modifyOriginalState func(optr *Operator) + modifyCV func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate) expectTimeChange bool }{ { name: "lastTransitionTime is not updated when nothing changes", - modifyOriginalState: func(condition *metav1.Condition) {}, + modifyOriginalState: func(optr *Operator) {}, + modifyCV: func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate) {}, }, { name: "lastTransitionTime is not updated when changed but status is identical", - modifyOriginalState: func(condition *metav1.Condition) { - condition.Reason = "OldReason" - condition.Message = "This message should be changed to something else" + modifyOriginalState: func(optr *Operator) { + optr.availableUpdates.ConditionalUpdates[0].Conditions[0].Reason = "OldReason" + optr.availableUpdates.ConditionalUpdates[0].Conditions[0].Message = "This message should be changed to something else" }, + modifyCV: func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate) {}, }, { name: "lastTransitionTime is updated when status changes", - modifyOriginalState: func(condition *metav1.Condition) { - condition.Status = metav1.ConditionUnknown + modifyOriginalState: func(optr *Operator) { + optr.availableUpdates.ConditionalUpdates[0].Conditions[0].Status = metav1.ConditionUnknown }, + modifyCV: func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate) {}, expectTimeChange: true, }, + { + name: "lastTransitionTime is updated on first fetch with empty CV status", + modifyOriginalState: func(optr *Operator) { + optr.availableUpdates = nil + }, + modifyCV: func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate) {}, + expectTimeChange: true, + }, + { + name: "lastTransitionTime is updated on first fetch when condition status in CV status differs from fetched status", + modifyOriginalState: func(optr *Operator) { + optr.availableUpdates = nil + }, + modifyCV: func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate) { + cv.Status.ConditionalUpdates = []configv1.ConditionalUpdate{*update.DeepCopy()} + cv.Status.ConditionalUpdates[0].Conditions[0].Status = metav1.ConditionUnknown + }, + expectTimeChange: true, + }, + { + name: "lastTransitionTime is not updated on first fetch when condition status in CV status matches fetched status", + modifyOriginalState: func(optr *Operator) { + optr.availableUpdates = nil + }, + modifyCV: func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate) { + cv.Status.ConditionalUpdates = []configv1.ConditionalUpdate{*update.DeepCopy()} + }, + expectTimeChange: false, + }, } for _, tc := range testCases { @@ -199,9 +233,11 @@ func TestSyncAvailableUpdates_ConditionalUpdateRecommendedConditions(t *testing. optr.availableUpdates.ConditionalUpdates = conditionalUpdates expectedConditions := []metav1.Condition{{}} conditionalUpdates[0].Conditions[0].DeepCopyInto(&expectedConditions[0]) - tc.modifyOriginalState(&optr.availableUpdates.ConditionalUpdates[0].Conditions[0]) + cv := cvFixture.DeepCopy() + tc.modifyOriginalState(optr) + tc.modifyCV(cv, conditionalUpdates[0]) - err := optr.syncAvailableUpdates(context.Background(), cvFixture) + err := optr.syncAvailableUpdates(context.Background(), cv) if err != nil { t.Fatalf("syncAvailableUpdates() unexpected error: %v", err) diff --git a/pkg/cvo/metrics.go b/pkg/cvo/metrics.go index 2583c315e..c021eedc3 100644 --- a/pkg/cvo/metrics.go +++ b/pkg/cvo/metrics.go @@ -56,6 +56,7 @@ type operatorMetrics struct { clusterOperatorConditionTransitions *prometheus.GaugeVec clusterInstaller *prometheus.GaugeVec clusterVersionOperatorUpdateRetrievalTimestampSeconds *prometheus.GaugeVec + clusterVersionConditionalUpdateConditionSeconds *prometheus.GaugeVec } func newOperatorMetrics(optr *Operator) *operatorMetrics { @@ -114,6 +115,10 @@ penultimate completed version for 'completed'. Name: "cluster_version_operator_update_retrieval_timestamp_seconds", Help: "Reports when updates were last successfully retrieved.", }, []string{"name"}), + clusterVersionConditionalUpdateConditionSeconds: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cluster_version_conditional_update_condition_seconds", + Help: "Reports when the Recommended condition status on a conditional update changed to its current state.", + }, []string{"version", "condition", "status", "reason"}), } } @@ -346,8 +351,25 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) { ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc() ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc() ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc() + ch <- m.clusterVersionConditionalUpdateConditionSeconds.WithLabelValues("", "", "", "").Desc() } +func (m *operatorMetrics) collectConditionalUpdates(ch chan<- prometheus.Metric, updates []configv1.ConditionalUpdate) { + for _, update := range updates { + for _, condition := range update.Conditions { + if condition.Type != ConditionalUpdateConditionTypeRecommended { + continue + } + + g := m.clusterVersionConditionalUpdateConditionSeconds + gauge := g.WithLabelValues(update.Release.Version, condition.Type, string(condition.Status), condition.Reason) + gauge.Set(float64(condition.LastTransitionTime.Unix())) + ch <- gauge + } + } +} + +// Collect collects metrics from the operator into the channel ch func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) { current := m.optr.currentVersion() var completed configv1.UpdateHistory @@ -468,6 +490,8 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) { } ch <- g } + + m.collectConditionalUpdates(ch, cv.Status.ConditionalUpdates) } g := m.version.WithLabelValues("current", current.Version, current.Image, completed.Version) diff --git a/pkg/cvo/metrics_test.go b/pkg/cvo/metrics_test.go index 2bab3c591..fe56cdcce 100644 --- a/pkg/cvo/metrics_test.go +++ b/pkg/cvo/metrics_test.go @@ -23,6 +23,42 @@ func Test_operatorMetrics_Collect(t *testing.T) { optr *Operator wants func(*testing.T, []prometheus.Metric) }{ + { + name: "collect conditional update recommendations", + optr: &Operator{ + name: "test", + releaseCreated: time.Unix(2, 0), + cvLister: &cvLister{ + Items: []*configv1.ClusterVersion{ + { + ObjectMeta: metav1.ObjectMeta{Name: "test"}, + Status: configv1.ClusterVersionStatus{ + ConditionalUpdates: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{Version: "4.5.6", Image: "pullspec/4.5.6"}, + Conditions: []metav1.Condition{ + { + Type: ConditionalUpdateConditionTypeRecommended, + Status: metav1.ConditionTrue, + LastTransitionTime: metav1.NewTime(time.Unix(3, 0)), + Reason: "RiskDoesNotApply", + Message: "Risk is not risky so you do not risk", + }, + }, + }, + }, + }, + }, + }, + }, + }, + wants: func(t *testing.T, metrics []prometheus.Metric) { + if len(metrics) != 5 { + t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics)) + } + expectMetric(t, metrics[2], 3, map[string]string{"condition": "Recommended", "reason": "RiskDoesNotApply", "status": "True", "version": "4.5.6"}) + }, + }, { name: "collects current version", optr: &Operator{ @@ -727,6 +763,182 @@ func Test_operatorMetrics_CollectTransitions(t *testing.T) { } } +func TestCollectUnknownConditionalUpdates(t *testing.T) { + anchorTime := time.Now() + + type valueWithLabels struct { + value float64 + labels map[string]string + } + testCases := []struct { + name string + updates []configv1.ConditionalUpdate + expected []valueWithLabels + }{ + { + name: "no conditional updates", + updates: []configv1.ConditionalUpdate{}, + expected: []valueWithLabels{}, + }, + { + name: "no recommended conditions on updates", + updates: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{Version: "4.13.1", Image: "pullspec"}, + Risks: []configv1.ConditionalUpdateRisk{{Name: "Risk"}}, + Conditions: []metav1.Condition{{ + Type: "SomethingElseThanRecommended", + Status: "Unknown", + LastTransitionTime: metav1.NewTime(anchorTime), + Reason: "NoIdea", + Message: "No Idea", + }}, + }, + }, + expected: []valueWithLabels{}, + }, + { + name: "recommended false", + updates: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{Version: "4.13.1", Image: "pullspec"}, + Risks: []configv1.ConditionalUpdateRisk{{Name: "Risk"}}, + Conditions: []metav1.Condition{{ + Type: ConditionalUpdateConditionTypeRecommended, + Status: metav1.ConditionFalse, + LastTransitionTime: metav1.NewTime(anchorTime), + Reason: "RiskApplies", + Message: "Risk is risky so do not risk it", + }}, + }, + }, + expected: []valueWithLabels{{ + value: float64(anchorTime.Unix()), + labels: map[string]string{"version": "4.13.1", "condition": "Recommended", "status": "False", "reason": "RiskApplies"}, + }}, + }, + { + name: "recommended true", + updates: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{Version: "4.13.1", Image: "pullspec"}, + Risks: []configv1.ConditionalUpdateRisk{{Name: "Risk"}}, + Conditions: []metav1.Condition{{ + Type: ConditionalUpdateConditionTypeRecommended, + Status: metav1.ConditionTrue, + LastTransitionTime: metav1.NewTime(anchorTime), + Reason: "RiskDoesNotApply", + Message: "Risk is not risky so you do not risk", + }}, + }, + }, + expected: []valueWithLabels{{ + value: float64(anchorTime.Unix()), + labels: map[string]string{"version": "4.13.1", "condition": "Recommended", "status": "True", "reason": "RiskDoesNotApply"}, + }}, + }, + { + name: "recommended unknown", + updates: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{Version: "4.13.1", Image: "pullspec"}, + Risks: []configv1.ConditionalUpdateRisk{{Name: "Risk"}}, + Conditions: []metav1.Condition{{ + Type: ConditionalUpdateConditionTypeRecommended, + Status: metav1.ConditionUnknown, + LastTransitionTime: metav1.NewTime(anchorTime), + Reason: "EvaluationFailed", + Message: "No idea sorry", + }}, + }, + }, + expected: []valueWithLabels{{ + value: float64(anchorTime.Unix()), + labels: map[string]string{"version": "4.13.1", "condition": "Recommended", "status": "Unknown", "reason": "EvaluationFailed"}, + }}, + }, + { + name: "multiple versions with different transition times", + updates: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{Version: "4.13.1", Image: "pullspec"}, + Risks: []configv1.ConditionalUpdateRisk{{Name: "Risk"}}, + Conditions: []metav1.Condition{{ + Type: ConditionalUpdateConditionTypeRecommended, + Status: metav1.ConditionFalse, + LastTransitionTime: metav1.NewTime(anchorTime), + Reason: "RiskApplies", + Message: "Risk is risky so do not risk it", + }}, + }, + { + Release: configv1.Release{Version: "4.13.2", Image: "pullspec"}, + Risks: []configv1.ConditionalUpdateRisk{{Name: "Risk"}}, + Conditions: []metav1.Condition{{ + Type: ConditionalUpdateConditionTypeRecommended, + Status: metav1.ConditionTrue, + LastTransitionTime: metav1.NewTime(anchorTime.Add(time.Minute)), + Reason: "RiskDoesNotApply", + Message: "Risk is not risky", + }}, + }, + { + Release: configv1.Release{Version: "4.13.3", Image: "pullspec"}, + Risks: []configv1.ConditionalUpdateRisk{{Name: "Risk"}}, + Conditions: []metav1.Condition{{ + Type: ConditionalUpdateConditionTypeRecommended, + Status: metav1.ConditionUnknown, + LastTransitionTime: metav1.NewTime(anchorTime.Add(time.Minute * 2)), + Reason: "EvaluationFailed", + Message: "Metrics stack was abducted by aliens", + }}, + }, + }, + expected: []valueWithLabels{ + { + value: float64(anchorTime.Unix()), + labels: map[string]string{"version": "4.13.1", "condition": "Recommended", "status": "False", "reason": "RiskApplies"}, + }, + { + value: float64(anchorTime.Unix() + 60), + labels: map[string]string{"version": "4.13.2", "condition": "Recommended", "status": "True", "reason": "RiskDoesNotApply"}, + }, + { + value: float64(anchorTime.Unix() + 120), + labels: map[string]string{"version": "4.13.3", "condition": "Recommended", "status": "Unknown", "reason": "EvaluationFailed"}, + }, + }, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + optr := &Operator{} + m := newOperatorMetrics(optr) + ch := make(chan prometheus.Metric) + + go func() { + m.collectConditionalUpdates(ch, tc.updates) + close(ch) + }() + + var collected []prometheus.Metric + for item := range ch { + collected = append(collected, item) + } + + if lenC, lenE := len(collected), len(tc.expected); lenC != lenE { + + t.Fatalf("Expected %d metrics, got %d metrics\nGot metrics: %s", lenE, lenC, spew.Sdump(collected)) + } + for i := range tc.expected { + expectMetric(t, collected[i], tc.expected[i].value, tc.expected[i].labels) + } + }) + } +} + func expectMetric(t *testing.T, metric prometheus.Metric, value float64, labels map[string]string) { t.Helper() var d dto.Metric diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go index 76ccff25e..37e942788 100644 --- a/pkg/cvo/status.go +++ b/pkg/cvo/status.go @@ -14,6 +14,7 @@ import ( "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/diff" "k8s.io/apimachinery/pkg/util/validation/field" @@ -31,12 +32,20 @@ const ( // and indicates the cluster is not healthy. ClusterStatusFailing = configv1.ClusterStatusConditionType("Failing") + // ConditionalUpdateConditionTypeRecommended is a type of the condition present on a conditional update + // that indicates whether the conditional update is recommended or not + ConditionalUpdateConditionTypeRecommended = "Recommended" + // MaxHistory is the maximum size of ClusterVersion history. Once exceeded // ClusterVersion history will be pruned. It is declared here and passed // into the pruner function to allow easier testing. MaxHistory = 100 ) +func findRecommendedCondition(conditions []metav1.Condition) *metav1.Condition { + return meta.FindStatusCondition(conditions, ConditionalUpdateConditionTypeRecommended) +} + func mergeEqualVersions(current *configv1.UpdateHistory, desired configv1.Release) bool { if len(desired.Image) > 0 && desired.Image == current.Image { if len(desired.Version) == 0 {