Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions install/0000_90_cluster-version-operator_02_servicemonitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,16 @@ spec:
for: 10m
labels:
severity: warning
- alert: CannotEvaluateConditionalUpdates
annotations:
summary: Cluster Version Operator cannot evaluate conditional update matches for {{ "{{ $value | humanizeDuration }}" }}.
description: Failure to evaluate conditional update matches means that Cluster Version Operator cannot decide whether an update path is recommended or not.
expr: |
max by (version, condition, status, reason)
(
(
time()-cluster_version_conditional_update_condition_seconds{condition="Recommended", status="Unknown"}
) >= 3600
)
labels:
severity: warning
16 changes: 11 additions & 5 deletions pkg/cvo/availableupdates.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
if optrAvailableUpdates == nil {
klog.V(2).Info("First attempt to retrieve available updates")
optrAvailableUpdates = &availableUpdates{}
// Populate known conditional updates from CV status, if present. They will be re-fetched later,
// but we need to populate existing conditions to avoid bumping lastTransitionTime fields on
// conditions if their status hasn't changed since previous CVO evaluated them.
for i := range config.Status.ConditionalUpdates {
optrAvailableUpdates.ConditionalUpdates = append(optrAvailableUpdates.ConditionalUpdates, *config.Status.ConditionalUpdates[i].DeepCopy())
}
} else if !optrAvailableUpdates.RecentlyChanged(optr.minimumUpdateCheckInterval) {
klog.V(2).Infof("Retrieving available updates again, because more than %s has elapsed since %s", optr.minimumUpdateCheckInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339))
} else if channel != optrAvailableUpdates.Channel {
Expand All @@ -59,7 +65,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
} else if upstream == optrAvailableUpdates.Upstream || (upstream == optr.defaultUpstreamServer && optrAvailableUpdates.Upstream == "") {
needsConditionalUpdateEval := false
for _, conditionalUpdate := range optrAvailableUpdates.ConditionalUpdates {
if recommended := meta.FindStatusCondition(conditionalUpdate.Conditions, "Recommended"); recommended == nil {
if recommended := findRecommendedCondition(conditionalUpdate.Conditions); recommended == nil {
needsConditionalUpdateEval = true
break
} else if recommended.Status != metav1.ConditionTrue && recommended.Status != metav1.ConditionFalse {
Expand Down Expand Up @@ -120,7 +126,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1

queueKey := optr.queueKey()
for _, conditionalUpdate := range optrAvailableUpdates.ConditionalUpdates {
if recommended := meta.FindStatusCondition(conditionalUpdate.Conditions, "Recommended"); recommended == nil {
if recommended := findRecommendedCondition(conditionalUpdate.Conditions); recommended == nil {
klog.Warningf("Requeue available-update evaluation, because %q lacks a Recommended condition", conditionalUpdate.Release.Version)
optr.availableUpdatesQueue.AddAfter(queueKey, time.Second)
break
Expand Down Expand Up @@ -382,14 +388,13 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
vj := semver.MustParse(u.ConditionalUpdates[j].Release.Version)
return vi.GTE(vj)
})

for i, conditionalUpdate := range u.ConditionalUpdates {
if errorCondition := evaluateConditionalUpdate(ctx, &conditionalUpdate, u.ConditionRegistry); errorCondition != nil {
meta.SetStatusCondition(&conditionalUpdate.Conditions, *errorCondition)
u.removeUpdate(conditionalUpdate.Release.Image)
} else {
meta.SetStatusCondition(&conditionalUpdate.Conditions, metav1.Condition{
Type: "Recommended",
Type: ConditionalUpdateConditionTypeRecommended,
Status: metav1.ConditionTrue,
// FIXME: ObservedGeneration? That would capture upstream/channel, but not necessarily the currently-reconciling version.
Reason: "AsExpected",
Expand All @@ -398,6 +403,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
u.addUpdate(conditionalUpdate.Release)
}
u.ConditionalUpdates[i].Conditions = conditionalUpdate.Conditions

}
}

Expand Down Expand Up @@ -428,7 +434,7 @@ func unknownExposureMessage(risk configv1.ConditionalUpdateRisk, err error) stri

func evaluateConditionalUpdate(ctx context.Context, conditionalUpdate *configv1.ConditionalUpdate, conditionRegistry clusterconditions.ConditionRegistry) *metav1.Condition {
recommended := &metav1.Condition{
Type: "Recommended",
Type: ConditionalUpdateConditionTypeRecommended,
}
messages := []string{}
for _, risk := range conditionalUpdate.Risks {
Expand Down
62 changes: 49 additions & 13 deletions pkg/cvo/availableupdates_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,11 @@ func osusWithSingleConditionalEdge() (*httptest.Server, clusterconditions.Condit
},
Conditions: []metav1.Condition{
{
Type: "Recommended",
Status: metav1.ConditionFalse,
Reason: "FourFiveSix",
Message: "Four Five Five is just fine https://example.com/" + to,
Type: "Recommended",
Status: metav1.ConditionFalse,
Reason: "FourFiveSix",
Message: "Four Five Five is just fine https://example.com/" + to,
LastTransitionTime: metav1.Now(),
},
},
},
Expand Down Expand Up @@ -167,27 +168,60 @@ func TestSyncAvailableUpdates(t *testing.T) {
func TestSyncAvailableUpdates_ConditionalUpdateRecommendedConditions(t *testing.T) {
testCases := []struct {
name string
modifyOriginalState func(condition *metav1.Condition)
modifyOriginalState func(optr *Operator)
modifyCV func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate)
expectTimeChange bool
}{
{
name: "lastTransitionTime is not updated when nothing changes",
modifyOriginalState: func(condition *metav1.Condition) {},
modifyOriginalState: func(optr *Operator) {},
modifyCV: func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate) {},
},
{
name: "lastTransitionTime is not updated when changed but status is identical",
modifyOriginalState: func(condition *metav1.Condition) {
condition.Reason = "OldReason"
condition.Message = "This message should be changed to something else"
modifyOriginalState: func(optr *Operator) {
optr.availableUpdates.ConditionalUpdates[0].Conditions[0].Reason = "OldReason"
optr.availableUpdates.ConditionalUpdates[0].Conditions[0].Message = "This message should be changed to something else"
},
modifyCV: func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate) {},
},
{
name: "lastTransitionTime is updated when status changes",
modifyOriginalState: func(condition *metav1.Condition) {
condition.Status = metav1.ConditionUnknown
modifyOriginalState: func(optr *Operator) {
optr.availableUpdates.ConditionalUpdates[0].Conditions[0].Status = metav1.ConditionUnknown
},
modifyCV: func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate) {},
expectTimeChange: true,
},
{
name: "lastTransitionTime is updated on first fetch with empty CV status",
modifyOriginalState: func(optr *Operator) {
optr.availableUpdates = nil
},
modifyCV: func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate) {},
expectTimeChange: true,
},
{
name: "lastTransitionTime is updated on first fetch when condition status in CV status differs from fetched status",
modifyOriginalState: func(optr *Operator) {
optr.availableUpdates = nil
},
modifyCV: func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate) {
cv.Status.ConditionalUpdates = []configv1.ConditionalUpdate{*update.DeepCopy()}
cv.Status.ConditionalUpdates[0].Conditions[0].Status = metav1.ConditionUnknown
},
expectTimeChange: true,
},
{
name: "lastTransitionTime is not updated on first fetch when condition status in CV status matches fetched status",
modifyOriginalState: func(optr *Operator) {
optr.availableUpdates = nil
},
modifyCV: func(cv *configv1.ClusterVersion, update configv1.ConditionalUpdate) {
cv.Status.ConditionalUpdates = []configv1.ConditionalUpdate{*update.DeepCopy()}
},
expectTimeChange: false,
},
}

for _, tc := range testCases {
Expand All @@ -199,9 +233,11 @@ func TestSyncAvailableUpdates_ConditionalUpdateRecommendedConditions(t *testing.
optr.availableUpdates.ConditionalUpdates = conditionalUpdates
expectedConditions := []metav1.Condition{{}}
conditionalUpdates[0].Conditions[0].DeepCopyInto(&expectedConditions[0])
tc.modifyOriginalState(&optr.availableUpdates.ConditionalUpdates[0].Conditions[0])
cv := cvFixture.DeepCopy()
tc.modifyOriginalState(optr)
tc.modifyCV(cv, conditionalUpdates[0])

err := optr.syncAvailableUpdates(context.Background(), cvFixture)
err := optr.syncAvailableUpdates(context.Background(), cv)

if err != nil {
t.Fatalf("syncAvailableUpdates() unexpected error: %v", err)
Expand Down
24 changes: 24 additions & 0 deletions pkg/cvo/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ type operatorMetrics struct {
clusterOperatorConditionTransitions *prometheus.GaugeVec
clusterInstaller *prometheus.GaugeVec
clusterVersionOperatorUpdateRetrievalTimestampSeconds *prometheus.GaugeVec
clusterVersionConditionalUpdateConditionSeconds *prometheus.GaugeVec
}

func newOperatorMetrics(optr *Operator) *operatorMetrics {
Expand Down Expand Up @@ -114,6 +115,10 @@ penultimate completed version for 'completed'.
Name: "cluster_version_operator_update_retrieval_timestamp_seconds",
Help: "Reports when updates were last successfully retrieved.",
}, []string{"name"}),
clusterVersionConditionalUpdateConditionSeconds: prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "cluster_version_conditional_update_condition_seconds",
Help: "Reports when the Recommended condition status on a conditional update changed to its current state.",
}, []string{"version", "condition", "status", "reason"}),
}
}

Expand Down Expand Up @@ -346,8 +351,25 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) {
ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc()
ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc()
ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc()
ch <- m.clusterVersionConditionalUpdateConditionSeconds.WithLabelValues("", "", "", "").Desc()
}

func (m *operatorMetrics) collectConditionalUpdates(ch chan<- prometheus.Metric, updates []configv1.ConditionalUpdate) {
for _, update := range updates {
for _, condition := range update.Conditions {
if condition.Type != ConditionalUpdateConditionTypeRecommended {
continue
}

g := m.clusterVersionConditionalUpdateConditionSeconds
gauge := g.WithLabelValues(update.Release.Version, condition.Type, string(condition.Status), condition.Reason)
gauge.Set(float64(condition.LastTransitionTime.Unix()))
ch <- gauge
}
}
}

// Collect collects metrics from the operator into the channel ch
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: truncated comment

func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
current := m.optr.currentVersion()
var completed configv1.UpdateHistory
Expand Down Expand Up @@ -468,6 +490,8 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
}
ch <- g
}

m.collectConditionalUpdates(ch, cv.Status.ConditionalUpdates)
}

g := m.version.WithLabelValues("current", current.Version, current.Image, completed.Version)
Expand Down
Loading