From 3e00af8ff73993735714ce62d94e9fae72b333bd Mon Sep 17 00:00:00 2001 From: Lalatendu Mohanty Date: Fri, 21 Oct 2022 15:14:33 -0400 Subject: [PATCH 01/21] OCPBUGS-2727: Do not fail precondition check for UnknownUpdate With this change we will not fail precondition checks when a version is not in available updates. So users should be able to upgrade to any version with approriate client side overrides. For example using --allow-explicit-upgrade and --to-imge flags in "oc adm upgrade" Signed-off-by: Lalatendu Mohanty --- .../clusterversion/recommendedupdate.go | 29 +++++++++++-------- pkg/payload/precondition/precondition.go | 17 +++++++---- pkg/payload/precondition/precondition_test.go | 4 +-- 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/pkg/payload/precondition/clusterversion/recommendedupdate.go b/pkg/payload/precondition/clusterversion/recommendedupdate.go index 4821c21de2..8a07ae255c 100644 --- a/pkg/payload/precondition/clusterversion/recommendedupdate.go +++ b/pkg/payload/precondition/clusterversion/recommendedupdate.go @@ -37,10 +37,11 @@ func (ru *RecommendedUpdate) Run(ctx context.Context, releaseContext preconditio } if err != nil { return &precondition.Error{ - Nested: err, - Reason: "UnknownError", - Message: err.Error(), - Name: ru.Name(), + Nested: err, + Reason: "UnknownError", + Message: err.Error(), + Name: ru.Name(), + NonBlockingWarning: true, } } for _, recommended := range clusterVersion.Status.AvailableUpdates { @@ -61,7 +62,8 @@ func (ru *RecommendedUpdate) Run(ctx context.Context, releaseContext preconditio Reason: condition.Reason, Message: fmt.Sprintf("Update from %s to %s is not recommended:\n\n%s", clusterVersion.Status.Desired.Version, releaseContext.DesiredVersion, condition.Message), - Name: ru.Name(), + Name: ru.Name(), + NonBlockingWarning: true, } default: return &precondition.Error{ @@ -69,7 +71,8 @@ func (ru *RecommendedUpdate) Run(ctx context.Context, releaseContext preconditio Message: fmt.Sprintf("Update from %s to %s is %s=%s: %s: %s", clusterVersion.Status.Desired.Version, releaseContext.DesiredVersion, condition.Type, condition.Status, condition.Reason, condition.Message), - Name: ru.Name(), + Name: ru.Name(), + NonBlockingWarning: true, } } } @@ -78,7 +81,8 @@ func (ru *RecommendedUpdate) Run(ctx context.Context, releaseContext preconditio Reason: "UnknownConditionType", Message: fmt.Sprintf("Update from %s to %s has a status.conditionalUpdates entry, but no Recommended condition.", clusterVersion.Status.Desired.Version, releaseContext.DesiredVersion), - Name: ru.Name(), + Name: ru.Name(), + NonBlockingWarning: true, } } } @@ -88,13 +92,13 @@ func (ru *RecommendedUpdate) Run(ctx context.Context, releaseContext preconditio Reason: "NoChannel", Message: fmt.Sprintf("Configured channel is unset, so the recommended status of updating from %s to %s is unknown.", clusterVersion.Status.Desired.Version, releaseContext.DesiredVersion), - Name: ru.Name(), + Name: ru.Name(), + NonBlockingWarning: true, } } reason := "UnknownUpdate" msg := "" - if retrieved := resourcemerge.FindOperatorStatusCondition(clusterVersion.Status.Conditions, configv1.RetrievedUpdates); retrieved == nil { msg = fmt.Sprintf("No %s, so the recommended status of updating from %s to %s is unknown.", configv1.RetrievedUpdates, clusterVersion.Status.Desired.Version, releaseContext.DesiredVersion) @@ -108,9 +112,10 @@ func (ru *RecommendedUpdate) Run(ctx context.Context, releaseContext preconditio if msg != "" { return &precondition.Error{ - Reason: reason, - Message: 
msg, - Name: ru.Name(), + Reason: reason, + Message: msg, + Name: ru.Name(), + NonBlockingWarning: true, } } return nil diff --git a/pkg/payload/precondition/precondition.go b/pkg/payload/precondition/precondition.go index 5daa1d47ec..f421ed44ed 100644 --- a/pkg/payload/precondition/precondition.go +++ b/pkg/payload/precondition/precondition.go @@ -12,10 +12,11 @@ import ( // Error is a wrapper for errors that occur during a precondition check for payload. type Error struct { - Nested error - Reason string - Message string - Name string + Nested error + Reason string + Message string + Name string + NonBlockingWarning bool // For some errors we do not want to fail the precondition check but we want to communicate about it } // Error returns the message @@ -71,12 +72,17 @@ func Summarize(errs []error, force bool) (bool, error) { return false, nil } var msgs []string + var isWarning = true for _, e := range errs { if pferr, ok := e.(*Error); ok { msgs = append(msgs, fmt.Sprintf("Precondition %q failed because of %q: %v", pferr.Name, pferr.Reason, pferr.Error())) + if !pferr.NonBlockingWarning { + isWarning = false + } continue } msgs = append(msgs, e.Error()) + } msg := "" if len(msgs) == 1 { @@ -87,9 +93,10 @@ func Summarize(errs []error, force bool) (bool, error) { if force { msg = fmt.Sprintf("Forced through blocking failures: %s", msg) + isWarning = true } - return !force, &payload.UpdateError{ + return !isWarning, &payload.UpdateError{ Nested: nil, Reason: "UpgradePreconditionCheckFailed", Message: msg, diff --git a/pkg/payload/precondition/precondition_test.go b/pkg/payload/precondition/precondition_test.go index 015a41cfd7..192a0a56ba 100644 --- a/pkg/payload/precondition/precondition_test.go +++ b/pkg/payload/precondition/precondition_test.go @@ -21,7 +21,7 @@ func TestSummarize(t *testing.T) { }, { name: "unrecognized error type", errors: []error{fmt.Errorf("random error")}, - expectedBlock: true, + expectedBlock: false, expectedError: "random error", }, { name: "forced unrecognized error type", @@ -42,7 +42,7 @@ func TestSummarize(t *testing.T) { }, { name: "two unrecognized error types", errors: []error{fmt.Errorf("random error"), fmt.Errorf("random error 2")}, - expectedBlock: true, + expectedBlock: false, expectedError: `Multiple precondition checks failed: * random error * random error 2`, From ff4a9f3663325002301e3a7fb3eb7590fcd0a022 Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Thu, 13 Oct 2022 18:02:48 +0200 Subject: [PATCH 02/21] OCPBUGS-1458: Allow CVO to update `KUBERNETES_SERVICE_HOST` with LB address The problem was identified to be a broken substitution of internal load balancer into `KUBERNETES_SERVICE_HOST` by Trevor and David (see my [JIRA comment](https://issues.redhat.com/browse/OCPBUGS-1458?focusedCommentId=21090756&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-21090756) and related [Slack thread](https://coreos.slack.com/archives/C011CSSPBLK/p1664925995946479?thread_ts=1661182025.992649&cid=C011CSSPBLK)). 
CVO injects the LB hostname in the [`ModifyDeployment`](https://github.com/openshift/cluster-version-operator/blob/dc1ad0aef5f3e1b88074448d21445a5bddb6b05b/lib/resourcebuilder/apps.go#L19) fine, but then the deployment gets applied in [`ApplyDeployment`](https://github.com/openshift/cluster-version-operator/blob/dc1ad0aef5f3e1b88074448d21445a5bddb6b05b/lib/resourceapply/apps.go#L17) and the `EnsureDeployment`->`ensurePodTemplateSpec`->`ensurePodSpec`->`ensureContainers`->`ensureContainer`->`ensureEnvVar` chain stomps the updated value in `required` with the old value from `existing` and reverts the injection in this way.

This behavior was added intentionally in https://github.com/openshift/cluster-version-operator/pull/559 as a part of a fix for various hot-looping issues. The substitution apparently caused some hot-looping issues in the past ([slack thread](https://coreos.slack.com/archives/CEGKQ43CP/p1620934857402200?thread_ts=1620895567.367100&cid=CEGKQ43CP)). I have tested removing the special handling of `KUBERNETES_SERVICE_HOST` thoroughly, and saw no problematic behavior. After fixing other hot-looping problems in https://github.com/openshift/cluster-version-operator/pull/855 to eliminate noise, no new hot-loops occur with `KUBERNETES_SERVICE_HOST` handling removed. --- lib/resourcemerge/core.go | 15 --------------- lib/resourcemerge/core_test.go | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/lib/resourcemerge/core.go b/lib/resourcemerge/core.go index b6f4df7619..9f3288c6b6 100644 --- a/lib/resourcemerge/core.go +++ b/lib/resourcemerge/core.go @@ -112,13 +112,6 @@ func ensureContainer(modified *bool, existing *corev1.Container, required corev1 func ensureEnvVar(modified *bool, existing *[]corev1.EnvVar, required []corev1.EnvVar) { for envidx := range required { - // Currently only CVO deployment uses this variable to inject internal LB host. - // This may result in an IP address being returned by API so assuming the - // returned value is correct.
- if required[envidx].Name == "KUBERNETES_SERVICE_HOST" { - ensureEnvVarKubeService(*existing, &required[envidx]) - } - if required[envidx].ValueFrom != nil { ensureEnvVarSourceFieldRefDefault(required[envidx].ValueFrom.FieldRef) } @@ -129,14 +122,6 @@ func ensureEnvVar(modified *bool, existing *[]corev1.EnvVar, required []corev1.E } } -func ensureEnvVarKubeService(existing []corev1.EnvVar, required *corev1.EnvVar) { - for envidx := range existing { - if existing[envidx].Name == required.Name { - required.Value = existing[envidx].Value - } - } -} - func ensureEnvVarSourceFieldRefDefault(required *corev1.ObjectFieldSelector) { if required != nil && required.APIVersion == "" { required.APIVersion = "v1" diff --git a/lib/resourcemerge/core_test.go b/lib/resourcemerge/core_test.go index 4ca8978549..29db68f3ef 100644 --- a/lib/resourcemerge/core_test.go +++ b/lib/resourcemerge/core_test.go @@ -1640,6 +1640,26 @@ func TestEnsureEnvVar(t *testing.T) { }, expectedModified: false, }, + { + name: "CVO can inject LB into ENVVAR", + existing: []corev1.EnvVar{ + {Name: "ENVVAR", Value: "127.0.0.1"}, + }, + input: []corev1.EnvVar{ + {Name: "ENVVAR", Value: "api-int.ci-ln-vqs15yk-72292.gcp-2.ci.openshift.org"}, + }, + expectedModified: true, + }, + { + name: "CVO can inject LB into KUBERNETES_SERVICE_HOST", + existing: []corev1.EnvVar{ + {Name: "KUBERNETES_SERVICE_HOST", Value: "127.0.0.1"}, + }, + input: []corev1.EnvVar{ + {Name: "KUBERNETES_SERVICE_HOST", Value: "api-int.ci-ln-vqs15yk-72292.gcp-2.ci.openshift.org"}, + }, + expectedModified: true, + }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { From 63c3c276312072dcff57732e8cc84beff78ce1e3 Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Tue, 25 Oct 2022 15:18:12 +0200 Subject: [PATCH 03/21] Poll for 30s on network errors when getting featuresets The client-go code retries a subset of network errors on GET for 30s, but we saw occurrences of other short disruptions, like DNS ones, that make us abort and restart unnecessarily soon. Make CVO retry all errors for 25s and only abort when we do not succeed in this time frame. This helps CVO survive short disruptions on startup, leading to less noise, mostly during installation. --- pkg/start/start.go | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/pkg/start/start.go b/pkg/start/start.go index 40309b1d12..6785a83414 100644 --- a/pkg/start/start.go +++ b/pkg/start/start.go @@ -16,6 +16,7 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" @@ -133,19 +134,37 @@ func (o *Options) Run(ctx context.Context) error { // check to see if techpreview should be on or off. If we cannot read the featuregate for any reason, it is assumed // to be off. If this value changes, the binary will shutdown and expect the pod lifecycle to restart it. startingFeatureSet := "" - gate, err := cb.ClientOrDie("feature-gate-getter").ConfigV1().FeatureGates().Get(ctx, "cluster", metav1.GetOptions{}) - switch { - case apierrors.IsNotFound(err): - // if we have no featuregates, then the cluster is using the default featureset, which is "". - // This excludes everything that could possibly depend on a different feature set. 
- startingFeatureSet = "" - case err != nil: - // client-go automatically retries network blip errors on GETs for 30s by default. If we fail longer than that - // the operator won't be able to do work anyway. Return the error and crashloop. - return err - default: - startingFeatureSet = string(gate.Spec.FeatureSet) + // client-go automatically retries some network blip errors on GETs for 30s by default, and we want to + // retry the remaining ones ourselves. If we fail longer than that, the operator won't be able to do work + // anyway. Return the error and crashloop. + // + // We implement the timeout with a context because the timeout in PollImmediateWithContext does not behave + // well when ConditionFunc takes longer time to execute, like here where the GET can be retried by client-go + fgCtx, fgCancel := context.WithTimeout(ctx, 25*time.Second) + defer fgCancel() + var lastError error + if err := wait.PollImmediateInfiniteWithContext(fgCtx, 2*time.Second, func(ctx context.Context) (bool, error) { + gate, fgErr := cb.ClientOrDie("feature-gate-getter").ConfigV1().FeatureGates().Get(ctx, "cluster", metav1.GetOptions{}) + switch { + case apierrors.IsNotFound(fgErr): + // if we have no featuregates, then the cluster is using the default featureset, which is "". + // This excludes everything that could possibly depend on a different feature set. + startingFeatureSet = "" + return true, nil + case fgErr != nil: + lastError = fgErr + klog.Warningf("Failed to get FeatureGate from cluster: %v", fgErr) + return false, nil + default: + startingFeatureSet = string(gate.Spec.FeatureSet) + return true, nil + } + }); err != nil { + if lastError != nil { + return lastError + } + return err } lock, err := createResourceLock(cb, o.Namespace, o.Name) From d736264e29c995c6eb97cbac484a9991b2f5a95f Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Thu, 15 Dec 2022 19:42:24 -0800 Subject: [PATCH 04/21] pkg/payload/precondition: Do not claim warnings would have blocked Avoid errors like: $ oc get -o json clusterversion version | jq -r '.status.history[0].acceptedRisks' Forced through blocking failures: Precondition "ClusterVersionRecommendedUpdate" failed because of "UnknownUpdate": RetrievedUpdates=True (), so the update from 4.13.0-0.okd-2022-12-11-064650 to 4.13.0-0.okd-2022-12-13-052859 is probably neither recommended nor supported. Instead, tweak the logic from 481bcde393 (OCPBUGS-2727: Do not fail precondition check for UnknownUpdate, 2022-10-21, #856), and only append the "Forced through blocking failures:" prefix when the forcing was required. Also fail-closed and treat non *Error errors as blocking too, where previously only !pferr.NonBlockingWarning were blocking. 
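With that in place, forcing past the warning-only UnknownUpdate case is expected to record just the precondition message itself as the accepted risk, matching the new "forced through warning" test case below (illustrative output based on the example above, not captured from a real cluster):

$ oc get -o json clusterversion version | jq -r '.status.history[0].acceptedRisks'
Precondition "ClusterVersionRecommendedUpdate" failed because of "UnknownUpdate": RetrievedUpdates=True (), so the update from 4.13.0-0.okd-2022-12-11-064650 to 4.13.0-0.okd-2022-12-13-052859 is probably neither recommended nor supported.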
--- pkg/payload/precondition/precondition.go | 3 ++- pkg/payload/precondition/precondition_test.go | 26 +++++++++++++++++-- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/pkg/payload/precondition/precondition.go b/pkg/payload/precondition/precondition.go index f421ed44ed..3725ad84aa 100644 --- a/pkg/payload/precondition/precondition.go +++ b/pkg/payload/precondition/precondition.go @@ -81,6 +81,7 @@ func Summarize(errs []error, force bool) (bool, error) { } continue } + isWarning = false msgs = append(msgs, e.Error()) } @@ -91,7 +92,7 @@ func Summarize(errs []error, force bool) (bool, error) { msg = fmt.Sprintf("Multiple precondition checks failed:\n* %s", strings.Join(msgs, "\n* ")) } - if force { + if force && !isWarning { msg = fmt.Sprintf("Forced through blocking failures: %s", msg) isWarning = true } diff --git a/pkg/payload/precondition/precondition_test.go b/pkg/payload/precondition/precondition_test.go index 192a0a56ba..581bf4c07b 100644 --- a/pkg/payload/precondition/precondition_test.go +++ b/pkg/payload/precondition/precondition_test.go @@ -21,7 +21,7 @@ func TestSummarize(t *testing.T) { }, { name: "unrecognized error type", errors: []error{fmt.Errorf("random error")}, - expectedBlock: false, + expectedBlock: true, expectedError: "random error", }, { name: "forced unrecognized error type", @@ -29,6 +29,28 @@ func TestSummarize(t *testing.T) { force: true, expectedBlock: false, expectedError: "Forced through blocking failures: random error", + }, { + name: "unforced warning", + errors: []error{&Error{ + Nested: nil, + Reason: "UnknownUpdate", + Message: "update from A to B is probably neither recommended nor supported.", + NonBlockingWarning: true, + Name: "ClusterVersionRecommendedUpdate", + }}, + expectedBlock: false, + expectedError: `Precondition "ClusterVersionRecommendedUpdate" failed because of "UnknownUpdate": update from A to B is probably neither recommended nor supported.`, + }, { + name: "forced through warning", + errors: []error{&Error{ + Nested: nil, + Reason: "UnknownUpdate", + Message: "update from A to B is probably neither recommended nor supported.", + NonBlockingWarning: true, + Name: "ClusterVersionRecommendedUpdate", + }}, + force: true, + expectedError: `Precondition "ClusterVersionRecommendedUpdate" failed because of "UnknownUpdate": update from A to B is probably neither recommended nor supported.`, }, { name: "single feature-gate error", errors: []error{&Error{ @@ -42,7 +64,7 @@ func TestSummarize(t *testing.T) { }, { name: "two unrecognized error types", errors: []error{fmt.Errorf("random error"), fmt.Errorf("random error 2")}, - expectedBlock: false, + expectedBlock: true, expectedError: `Multiple precondition checks failed: * random error * random error 2`, From bd5cc2aea27f8d8b813452a8bab13de4935c4e6c Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Mon, 9 Jan 2023 15:53:12 +0100 Subject: [PATCH 05/21] OCPBUGS-5505: Set upgradeability check throttling period to 2m Previously, the throttling reused the `minimumUpdateCheckInterval` value which is derived from the full CVO minimum sync period. This value is set between 2m and 4m at CVO startup. This period is unecessarily long and bad for UX, things happen with a delay and our own testcase expects upgradeability to be propagated in 3 minutes at worst. Hardcode the throttling to 2m (lower bound of previous behavior) to prevent flapping on flurries but allow changes to propagate deterministically faster. 
We will still get a bit of non-determinisim from sync periods and requeueing, so this change should not cause any periodic API-hammering. --- pkg/cvo/upgradeable.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/cvo/upgradeable.go b/pkg/cvo/upgradeable.go index d42b7f318c..4830c4a1a1 100644 --- a/pkg/cvo/upgradeable.go +++ b/pkg/cvo/upgradeable.go @@ -29,17 +29,18 @@ import ( const ( adminAckGateFmt string = "^ack-[4-5][.]([0-9]{1,})-[^-]" upgradeableAdminAckRequired = configv1.ClusterStatusConditionType("UpgradeableAdminAckRequired") + checkThrottlePeriod = 2 * time.Minute ) var adminAckGateRegexp = regexp.MustCompile(adminAckGateFmt) // syncUpgradeable synchronizes the upgradeable status only if it has been more than -// the minimumUpdateCheckInterval since the last synchronization or the precondition +// the checkThrottlePeriod since the last synchronization or the precondition // checks on the payload are failing for less than minimumUpdateCheckInterval, and it has // been more than the minimumUpgradeableCheckInterval since the last synchronization. func (optr *Operator) syncUpgradeable(config *configv1.ClusterVersion) error { u := optr.getUpgradeable() - if u != nil && u.RecentlyChanged(optr.minimumUpdateCheckInterval) && !shouldSyncUpgradeableDueToPreconditionChecks(optr, config, u) { + if u != nil && u.RecentlyChanged(checkThrottlePeriod) && !shouldSyncUpgradeableDueToPreconditionChecks(optr, config, u) { klog.V(2).Infof("Upgradeable conditions were recently checked, will try later.") return nil } From 2247cf2fdd2a8e0713c3afd4f087c58368fa810d Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Wed, 11 Jan 2023 20:49:47 +0100 Subject: [PATCH 06/21] pkg/cvo/upgradeable: refactor throttling Refactor the code that handles throttling upgradeability checks. Create a new method that computes the duration for which the existing `Upgradeable` status is considered recent enough to not be synced, and simply pass this duration to the `RecentlyChanged` method. The new method is now unit tested, too. Upgradeable-related intervals are now uncoupled to unrelated sync intervals and are grouped in a new struct. --- pkg/cvo/cvo.go | 17 ++++----- pkg/cvo/upgradeable.go | 76 ++++++++++++++++++++++++------------- pkg/cvo/upgradeable_test.go | 59 ++++++++++++++++++++++++++++ 3 files changed, 117 insertions(+), 35 deletions(-) create mode 100644 pkg/cvo/upgradeable_test.go diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go index 82538be816..a70b73eed3 100644 --- a/pkg/cvo/cvo.go +++ b/pkg/cvo/cvo.go @@ -125,10 +125,9 @@ type Operator struct { upgradeable *upgradeable upgradeableChecks []upgradeableCheck - // minimumUpgradeableCheckInterval is the minimum duration before another - // synchronization of the upgradeable status can happen during failing - // precondition checks. - minimumUpgradeableCheckInterval time.Duration + // upgradeableCheckIntervals drives minimal intervals between Upgradeable status + // synchronization + upgradeableCheckIntervals upgradeableCheckIntervals // verifier, if provided, will be used to check an update before it is executed. // Any error will prevent an update payload from being accessed. 
@@ -188,11 +187,11 @@ func New( Image: releaseImage, }, - statusInterval: 15 * time.Second, - minimumUpdateCheckInterval: minimumInterval, - minimumUpgradeableCheckInterval: 15 * time.Second, - payloadDir: overridePayloadDir, - defaultUpstreamServer: "https://api.openshift.com/api/upgrades_info/v1/graph", + statusInterval: 15 * time.Second, + minimumUpdateCheckInterval: minimumInterval, + upgradeableCheckIntervals: defaultUpgradeableCheckIntervals(), + payloadDir: overridePayloadDir, + defaultUpstreamServer: "https://api.openshift.com/api/upgrades_info/v1/graph", client: client, kubeClient: kubeClient, diff --git a/pkg/cvo/upgradeable.go b/pkg/cvo/upgradeable.go index 4830c4a1a1..0074e2a547 100644 --- a/pkg/cvo/upgradeable.go +++ b/pkg/cvo/upgradeable.go @@ -29,20 +29,45 @@ import ( const ( adminAckGateFmt string = "^ack-[4-5][.]([0-9]{1,})-[^-]" upgradeableAdminAckRequired = configv1.ClusterStatusConditionType("UpgradeableAdminAckRequired") - checkThrottlePeriod = 2 * time.Minute ) var adminAckGateRegexp = regexp.MustCompile(adminAckGateFmt) -// syncUpgradeable synchronizes the upgradeable status only if it has been more than -// the checkThrottlePeriod since the last synchronization or the precondition -// checks on the payload are failing for less than minimumUpdateCheckInterval, and it has -// been more than the minimumUpgradeableCheckInterval since the last synchronization. -func (optr *Operator) syncUpgradeable(config *configv1.ClusterVersion) error { - u := optr.getUpgradeable() - if u != nil && u.RecentlyChanged(checkThrottlePeriod) && !shouldSyncUpgradeableDueToPreconditionChecks(optr, config, u) { - klog.V(2).Infof("Upgradeable conditions were recently checked, will try later.") - return nil +// upgradeableCheckIntervals holds the time intervals that drive how often CVO checks for upgradeability +type upgradeableCheckIntervals struct { + // min is the base minimum interval between upgradeability checks, applied under normal circumstances + min time.Duration + + // minOnFailedPreconditions is the minimum interval between upgradeability checks when precondition checks are + // failing and were recently (see afterPreconditionsFailed) changed. This should be lower than min because we want CVO + // to check upgradeability more often + minOnFailedPreconditions time.Duration + + // afterFailingPreconditions is the period of time after preconditions failed when minOnFailedPreconditions is + // applied instead of min + afterPreconditionsFailed time.Duration +} + +func defaultUpgradeableCheckIntervals() upgradeableCheckIntervals { + return upgradeableCheckIntervals{ + // 2 minutes are here because it is a lower bound of previously nondeterministicly chosen interval + // TODO (OTA-860): Investigate our options of reducing this interval. We will need to investigate + // the API usage patterns of the underlying checks, there is anecdotal evidence that they hit + // apiserver instead of using local informer cache + min: 2 * time.Minute, + minOnFailedPreconditions: 15 * time.Second, + afterPreconditionsFailed: 2 * time.Minute, + } +} + +// syncUpgradeable synchronizes the upgradeable status only if the sufficient time passed since its last update. This +// throttling period is dynamic and is driven by upgradeableCheckIntervals. 
+func (optr *Operator) syncUpgradeable(cv *configv1.ClusterVersion) error { + if u := optr.getUpgradeable(); u != nil { + if u.RecentlyChanged(optr.upgradeableCheckIntervals.throttlePeriod(cv)) { + klog.V(2).Infof("Upgradeable conditions were recently checked, will try later.") + return nil + } } optr.setUpgradeableConditions() @@ -451,21 +476,20 @@ func (optr *Operator) adminGatesEventHandler() cache.ResourceEventHandler { } } -// shouldSyncUpgradeableDueToPreconditionChecks checks if the upgradeable status should -// be synchronized due to the precondition checks. It checks whether the precondition -// checks on the payload are failing for less than minimumUpdateCheckInterval, and it has -// been more than the minimumUpgradeableCheckInterval since the last synchronization. -// This means, upon precondition failure, we will synchronize the upgradeable status -// more frequently at beginning of an upgrade. +// throttlePeriod returns the duration for which upgradeable status should be considered recent +// enough and unnecessary to update. The baseline duration is min. When the precondition checks +// on the payload are failing for less than afterPreconditionsFailed we want to synchronize +// the upgradeable status more frequently at beginning of an upgrade and return +// minOnFailedPreconditions which is expected to be lower than min. // -// shouldSyncUpgradeableDueToPreconditionChecks expects the parameters not to be nil. -// -// Function returns true if the synchronization should happen, returns false otherwise. -func shouldSyncUpgradeableDueToPreconditionChecks(optr *Operator, config *configv1.ClusterVersion, u *upgradeable) bool { - cond := resourcemerge.FindOperatorStatusCondition(config.Status.Conditions, DesiredReleaseAccepted) - if cond != nil && cond.Reason == "PreconditionChecks" && cond.Status == configv1.ConditionFalse && - hasPassedDurationSinceTime(u.At, optr.minimumUpgradeableCheckInterval) && !hasPassedDurationSinceTime(cond.LastTransitionTime.Time, optr.minimumUpdateCheckInterval) { - return true - } - return false +// The cv parameter is expected to be non-nil. +func (intervals *upgradeableCheckIntervals) throttlePeriod(cv *configv1.ClusterVersion) time.Duration { + if cond := resourcemerge.FindOperatorStatusCondition(cv.Status.Conditions, DesiredReleaseAccepted); cond != nil { + // Function returns true if the synchronization should happen, returns false otherwise. 
+ if cond.Reason == "PreconditionChecks" && cond.Status == configv1.ConditionFalse && + !hasPassedDurationSinceTime(cond.LastTransitionTime.Time, intervals.afterPreconditionsFailed) { + return intervals.minOnFailedPreconditions + } + } + return intervals.min } diff --git a/pkg/cvo/upgradeable_test.go b/pkg/cvo/upgradeable_test.go new file mode 100644 index 0000000000..ec9e41264f --- /dev/null +++ b/pkg/cvo/upgradeable_test.go @@ -0,0 +1,59 @@ +package cvo + +import ( + "testing" + "time" + + configv1 "github.com/openshift/api/config/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestUpgradeableCheckIntervalsThrottlePeriod(t *testing.T) { + intervals := defaultUpgradeableCheckIntervals() + testCases := []struct { + name string + condition *configv1.ClusterOperatorStatusCondition + expected time.Duration + }{ + { + name: "no condition", + expected: intervals.min, + }, + { + name: "passing preconditions", + condition: &configv1.ClusterOperatorStatusCondition{Type: DesiredReleaseAccepted, Reason: "PreconditionChecks", Status: configv1.ConditionTrue, LastTransitionTime: metav1.Now()}, + expected: intervals.min, + }, + { + name: "failing but not precondition", + condition: &configv1.ClusterOperatorStatusCondition{Type: DesiredReleaseAccepted, Reason: "NotPreconditionChecks", Status: configv1.ConditionFalse, LastTransitionTime: metav1.Now()}, + expected: intervals.min, + }, + { + name: "failing preconditions but too long ago", + condition: &configv1.ClusterOperatorStatusCondition{ + Type: DesiredReleaseAccepted, + Reason: "PreconditionChecks", + Status: configv1.ConditionFalse, + LastTransitionTime: metav1.NewTime(time.Now().Add(-(intervals.afterPreconditionsFailed + time.Hour))), + }, + expected: intervals.min, + }, + { + name: "failing preconditions recently", + condition: &configv1.ClusterOperatorStatusCondition{Type: DesiredReleaseAccepted, Reason: "PreconditionChecks", Status: configv1.ConditionFalse, LastTransitionTime: metav1.Now()}, + expected: intervals.minOnFailedPreconditions, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + cv := &configv1.ClusterVersion{Status: configv1.ClusterVersionStatus{Conditions: []configv1.ClusterOperatorStatusCondition{}}} + if tc.condition != nil { + cv.Status.Conditions = append(cv.Status.Conditions, *tc.condition) + } + if actual := intervals.throttlePeriod(cv); actual != tc.expected { + t.Errorf("throttlePeriod() = %v, want %v", actual, tc.expected) + } + }) + } +} From 2234774f35192b686157c3e96e344fcdc39529dd Mon Sep 17 00:00:00 2001 From: Lalatendu Mohanty Date: Fri, 3 Mar 2023 11:00:38 -0500 Subject: [PATCH 07/21] OCPBUGS-8304: Adding admin-gate ack-4.12-kube-1.26-api-removals-in-4.13 Signed-off-by: Lalatendu Mohanty --- ...000_00_cluster-version-operator_01_admingate_configmap.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/install/0000_00_cluster-version-operator_01_admingate_configmap.yaml b/install/0000_00_cluster-version-operator_01_admingate_configmap.yaml index a3387fd866..f468c90771 100644 --- a/install/0000_00_cluster-version-operator_01_admingate_configmap.yaml +++ b/install/0000_00_cluster-version-operator_01_admingate_configmap.yaml @@ -1,5 +1,8 @@ apiVersion: v1 kind: ConfigMap +data: + ack-4.12-kube-1.26-api-removals-in-4.13: |- + Kubernetes 1.26 and therefore OpenShift 4.13 remove several APIs which require admin consideration. Please see the knowledge article https://access.redhat.com/articles/6958394 for details and instructions. 
metadata: name: admin-gates namespace: openshift-config-managed From ebda83af7502d46ba13206b89bc8788d2d7ca7d5 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Mon, 6 Mar 2023 05:21:30 -0800 Subject: [PATCH 08/21] pkg/cvo/availableupdates: Prioritize conditional risks for largest target version When changing channels it's possible that multiple new conditional update risks will need to be evaluated. For instance, a cluster running 4.10.34 in a 4.10 channel might only have to evaluate OpenStackNodeCreationFails. But when the channel is changed to a 4.11 channel multiple new risks require evaluation and the evaluation of new risks was throttled at one every 10 minutes. This means if there are three new risks it may take up to 20 minutes after the channel has changed for the full set of conditional updates to be computed. With this commit, I'm sorting the conditional updates in version-descending order, which is the order we've used in the ClusterVersion status since c9dd4792f3ef (pkg/cvo/availableupdates: Sort (conditional)updates, 2021-09-29, #663). This prioritizes the longest-hop risks. For example, 4.10.34 currently has the following updates: * 4.10.(z!=38): no risks * 4.10.38: OpenStackNodeCreationFails * 4.11.(z<10): ARM64SecCompError524, AWSOldBootImagesLackAfterburn, MachineConfigRenderingChurn, OVNNetworkPolicyLongName * 4.11.(10<=z<26): ARM64SecCompError524, AWSOldBootImagesLackAfterburn, MachineConfigRenderingChurn * 4.11.26: ARM64SecCompError524, AWSOldBootImagesLackAfterburn * 4.11.(27<=z<...): AWSOldBootImagesLackAfterburn By focusing on the largest target (say 4.11.30), we'd evaluate AWSOldBootImagesLackAfterburn first. If it did not match the current cluster, 4.11.27 and later would be quickly recommended. It would take another 10m before the self-throttling allowed us to evaluate ARM64SecCompError524, and once we had, that would unblock 4.11.26. Ten minutes after that, we'd evaluate MachineConfigRenderingChurn, and unblock 4.11.(10<=z<26). And so on. But folks on 4.10.34 today are much more likely to be interested in 4.11.30 and other tip releases than they are to care about 4.11.10 and other relatively old releases. 
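For reference, the version-descending ordering introduced below amounts to a semver sort like the following sketch (the blang/semver/v4 import path is an assumption about the module the CVO vendors; the versions are taken from the 4.10.34 example above):

package main

import (
	"fmt"
	"sort"

	"github.com/blang/semver/v4" // assumption: the semver module vendored by the CVO
)

func main() {
	// Conditional-update targets from the example above, in arbitrary input order.
	targets := []string{"4.10.38", "4.11.10", "4.11.26", "4.11.30"}
	sort.Slice(targets, func(i, j int) bool {
		// Strict greater-than gives a descending order: the largest target comes
		// first, so its risks are evaluated before those of older releases.
		return semver.MustParse(targets[i]).GT(semver.MustParse(targets[j]))
	})
	fmt.Println(targets) // [4.11.30 4.11.26 4.11.10 4.10.38]
}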
--- pkg/cvo/availableupdates.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pkg/cvo/availableupdates.go b/pkg/cvo/availableupdates.go index 8c24a79a24..2c9d13469e 100644 --- a/pkg/cvo/availableupdates.go +++ b/pkg/cvo/availableupdates.go @@ -282,6 +282,12 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { return } + sort.Slice(u.ConditionalUpdates, func(i, j int) bool { + vi := semver.MustParse(u.ConditionalUpdates[i].Release.Version) + vj := semver.MustParse(u.ConditionalUpdates[j].Release.Version) + return vi.GTE(vj) + }) + for i, conditionalUpdate := range u.ConditionalUpdates { if errorCondition := evaluateConditionalUpdate(ctx, &conditionalUpdate); errorCondition != nil { meta.SetStatusCondition(&conditionalUpdate.Conditions, *errorCondition) From 94d33448eb247804032fc76069e02e2e38ddf482 Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Fri, 3 Feb 2023 01:46:04 +0100 Subject: [PATCH 09/21] RetrievePayload: improve testability and add tests --- pkg/cvo/updatepayload.go | 34 ++-- pkg/cvo/updatepayload_test.go | 346 ++++++++++++++++++++++++++++++++++ 2 files changed, 369 insertions(+), 11 deletions(-) create mode 100644 pkg/cvo/updatepayload_test.go diff --git a/pkg/cvo/updatepayload.go b/pkg/cvo/updatepayload.go index 7b2d1c79a3..f375c053f1 100644 --- a/pkg/cvo/updatepayload.go +++ b/pkg/cvo/updatepayload.go @@ -38,14 +38,16 @@ func (optr *Operator) defaultPayloadDir() string { func (optr *Operator) defaultPayloadRetriever() PayloadRetriever { return &payloadRetriever{ - kubeClient: optr.kubeClient, - operatorName: optr.name, - releaseImage: optr.release.Image, - namespace: optr.namespace, - nodeName: optr.nodename, - payloadDir: optr.defaultPayloadDir(), - workingDir: targetUpdatePayloadsDir, - verifier: optr.verifier, + kubeClient: optr.kubeClient, + operatorName: optr.name, + releaseImage: optr.release.Image, + namespace: optr.namespace, + nodeName: optr.nodename, + payloadDir: optr.defaultPayloadDir(), + workingDir: targetUpdatePayloadsDir, + verifier: optr.verifier, + verifyTimeoutOnForce: 2 * time.Minute, + downloadTimeout: 2 * time.Minute, } } @@ -53,6 +55,8 @@ const ( targetUpdatePayloadsDir = "/etc/cvo/updatepayloads" ) +type downloadFunc func(context.Context, configv1.Update) (string, error) + type payloadRetriever struct { // releaseImage and payloadDir are the default payload identifiers - updates that point // to releaseImage will always use the contents of payloadDir @@ -67,7 +71,11 @@ type payloadRetriever struct { operatorName string // verifier guards against invalid remote data being accessed - verifier verify.Interface + verifier verify.Interface + verifyTimeoutOnForce time.Duration + + downloader downloadFunc + downloadTimeout time.Duration } func (r *payloadRetriever) RetrievePayload(ctx context.Context, update configv1.Update) (PayloadInfo, error) { @@ -94,7 +102,7 @@ func (r *payloadRetriever) RetrievePayload(ctx context.Context, update configv1. // if 'force' specified, ensure call to verify payload signature times out well before parent context // to allow time to perform forced update if update.Force { - timeout := time.Minute * 2 + timeout := r.verifyTimeoutOnForce if deadline, deadlineSet := ctx.Deadline(); deadlineSet { timeout = time.Until(deadline) / 2 } @@ -120,9 +128,13 @@ func (r *payloadRetriever) RetrievePayload(ctx context.Context, update configv1. 
info.Verified = true } + if r.downloader == nil { + r.downloader = r.targetUpdatePayloadDir + } + // download the payload to the directory var err error - info.Directory, err = r.targetUpdatePayloadDir(ctx, update) + info.Directory, err = r.downloader(ctx, update) if err != nil { return PayloadInfo{}, &payload.UpdateError{ Reason: "UpdatePayloadRetrievalFailed", diff --git a/pkg/cvo/updatepayload_test.go b/pkg/cvo/updatepayload_test.go new file mode 100644 index 0000000000..437987a6aa --- /dev/null +++ b/pkg/cvo/updatepayload_test.go @@ -0,0 +1,346 @@ +package cvo + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + configv1 "github.com/openshift/api/config/v1" + "github.com/openshift/library-go/pkg/verify/store" + "golang.org/x/crypto/openpgp" + + "github.com/openshift/cluster-version-operator/pkg/payload" +) + +// EquateErrorMessage reports errors to be equal if both are nil +// or both have the same message +var EquateErrorMessage = cmp.FilterValues(func(x, y interface{}) bool { + _, ok1 := x.(error) + _, ok2 := y.(error) + return ok1 && ok2 +}, cmp.Comparer(func(x, y interface{}) bool { + xe := x.(error) + ye := y.(error) + if xe == nil || ye == nil { + return xe == nil && ye == nil + } + return xe.Error() == ye.Error() +})) + +// mockVerifier implements verify.Verifier +type mockVerifier struct { + t *testing.T + + expectNoVerify bool + expectVerifyDigest string + expectVerifyCancel bool + verifyReturns error +} + +func (m *mockVerifier) Verify(ctx context.Context, releaseDigest string) error { + if m.expectNoVerify { + m.t.Errorf("Unexpected call: Verify(releaseDigest=%s)", releaseDigest) + } + if !m.expectNoVerify && m.expectVerifyDigest != releaseDigest { + m.t.Errorf("Verify() called with unexpected value: %v", releaseDigest) + } + + timeout, timeoutCancel := context.WithTimeout(context.Background(), backstopDuration) + defer timeoutCancel() + + if m.expectVerifyCancel { + select { + case <-ctx.Done(): + case <-timeout.Done(): + m.t.Errorf("Verify() expected to be cancelled by context but it was not") + } + } else { + select { + case <-ctx.Done(): + m.t.Errorf("Unexpected ctx cancel in Verify()") + default: + } + } + + return m.verifyReturns +} +func (m *mockVerifier) Signatures() map[string][][]byte { return nil } +func (m *mockVerifier) Verifiers() map[string]openpgp.EntityList { return nil } +func (m *mockVerifier) AddStore(_ store.Store) {} + +type downloadMocker struct { + expectCancel bool + duration time.Duration + returnLocation string + returnErr error +} + +const ( + // backstopDuration is a maximum duration for which we wait on a tested operation + backstopDuration = 5 * time.Second + // hangs represents a "hanging" operation, always preempted by backstop + hangs = 2 * backstopDuration +) + +func (d *downloadMocker) make(t *testing.T) downloadFunc { + if d == nil { + return func(_ context.Context, update configv1.Update) (string, error) { + t.Errorf("Unexpected call: downloader(, upddate=%v", update) + return "", nil + } + } + return func(ctx context.Context, _ configv1.Update) (string, error) { + backstopCtx, backstopCancel := context.WithTimeout(context.Background(), backstopDuration) + defer backstopCancel() + + downloadCtx, downloadCancel := context.WithTimeout(context.Background(), d.duration) + defer downloadCancel() + + if d.expectCancel { + select { + case <-backstopCtx.Done(): + t.Errorf("downloader: test backstop hit (expected cancel via ctx)") + return "", errors.New("downloader: 
test backstop hit (expected cancel via ctx)") + case <-downloadCtx.Done(): + t.Errorf("downloader: download finished (expected cancel via ctx)") + return "/some/location", errors.New("downloader: download finished (expected cancel via ctx)") + case <-ctx.Done(): + } + } else { + select { + case <-backstopCtx.Done(): + t.Errorf("downloader: test backstop hit (expected download to finish)") + return "", errors.New("downloader: test backstop hit (expected download to finish)") + case <-ctx.Done(): + t.Errorf("downloader: unexpected ctx cancel (expected download to finish)") + return "", errors.New("downloader: unexpected ctx cancel (expected download to finish)") + case <-downloadCtx.Done(): + } + } + + return d.returnLocation, d.returnErr + } +} + +func TestPayloadRetrieverRetrievePayload(t *testing.T) { + t.Parallel() + testCases := []struct { + name string + verifier *mockVerifier + downloader *downloadMocker + update configv1.Update + ctxTimeout time.Duration + + expected PayloadInfo + expectedErr error + }{ + { + name: "when desired image matches retriever image then return local payload directory", + verifier: &mockVerifier{expectNoVerify: true}, + update: configv1.Update{Image: "releaseImage"}, + expected: PayloadInfo{ + Directory: "/local/payload/dir", + Local: true, + }, + }, + { + name: "when desired image is empty then return error", + verifier: &mockVerifier{expectNoVerify: true}, + update: configv1.Update{}, + expectedErr: errors.New("no payload image has been specified and the contents of the payload cannot be retrieved"), + }, + { + name: "when desired image is tag pullspec and passes verification but fails to download then return error", + verifier: &mockVerifier{expectVerifyDigest: ""}, + downloader: &downloadMocker{returnErr: errors.New("fails to download")}, + update: configv1.Update{Image: "quay.io/openshift-release-dev/ocp-release:failing"}, + expectedErr: errors.New("Unable to download and prepare the update: fails to download"), + }, + { + name: "when desired image is tag pullspec and fails verification then return error", + verifier: &mockVerifier{ + expectVerifyDigest: "", + verifyReturns: errors.New("fails-verification"), + }, + update: configv1.Update{Image: "quay.io/openshift-release-dev/ocp-release:failing"}, + expectedErr: errors.New("The update cannot be verified: fails-verification"), + }, + { + name: "when desired image is sha digest pullspec and passes verification but fails to download then return error", + verifier: &mockVerifier{expectVerifyDigest: "sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4"}, + downloader: &downloadMocker{returnErr: errors.New("fails to download")}, + update: configv1.Update{Image: "quay.io/openshift-release-dev/ocp-release@sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4"}, + expectedErr: errors.New("Unable to download and prepare the update: fails to download"), + }, + { + name: "when sha digest pullspec image fails verification but update is forced then retrieval proceeds then when download fails then return error", + verifier: &mockVerifier{ + expectVerifyDigest: "sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4", + verifyReturns: errors.New("fails-to-verify"), + }, + downloader: &downloadMocker{returnErr: errors.New("fails to download")}, + update: configv1.Update{ + Force: true, + Image: "quay.io/openshift-release-dev/ocp-release@sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4", + }, + expectedErr: errors.New("Unable to download and 
prepare the update: fails to download"), + }, + { + name: "when sha digest pullspec image is timing out verification with unlimited context and update is forced " + + "then verification times out promptly and retrieval proceeds but download fails then return error", + verifier: &mockVerifier{ + expectVerifyDigest: "sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4", + expectVerifyCancel: true, + verifyReturns: errors.New("fails-to-verify"), + }, + downloader: &downloadMocker{returnErr: errors.New("fails to download")}, + update: configv1.Update{ + Force: true, + Image: "quay.io/openshift-release-dev/ocp-release@sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4", + }, + expectedErr: errors.New("Unable to download and prepare the update: fails to download"), + }, + { + name: "when sha digest pullspec image is timing out verification with long deadline context and update is forced " + + "then verification times out promptly, retrieval proceeds but download fails then return error", + verifier: &mockVerifier{ + expectVerifyDigest: "sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4", + expectVerifyCancel: true, + verifyReturns: errors.New("fails-to-verify"), + }, + downloader: &downloadMocker{returnErr: errors.New("fails to download")}, + update: configv1.Update{ + Force: true, + Image: "quay.io/openshift-release-dev/ocp-release@sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4", + }, + ctxTimeout: backstopDuration - time.Second, + expectedErr: errors.New("Unable to download and prepare the update: fails to download"), + }, + { + name: "when sha digest pullspec image is timing out verification with long deadline context and update is forced " + + "then verification times out promptly, retrieval proceeds but download times out then return error", + verifier: &mockVerifier{ + expectVerifyDigest: "sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4", + expectVerifyCancel: true, + verifyReturns: errors.New("fails-to-verify"), + }, + downloader: &downloadMocker{ + expectCancel: true, + duration: backstopDuration - (2 * time.Second), + returnErr: errors.New("fails to download"), + }, + update: configv1.Update{ + Force: true, + Image: "quay.io/openshift-release-dev/ocp-release@sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4", + }, + ctxTimeout: backstopDuration - time.Second, + expectedErr: errors.New("Unable to download and prepare the update: fails to download"), + }, + { + name: "when sha digest pullspec image fails verification but update is forced then retrieval proceeds and download succeeds then return info with location and verification error", + verifier: &mockVerifier{ + expectVerifyDigest: "sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4", + verifyReturns: errors.New("fails-to-verify"), + }, + downloader: &downloadMocker{returnLocation: "/location/of/download"}, + update: configv1.Update{ + Force: true, + Image: "quay.io/openshift-release-dev/ocp-release@sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4", + }, + expected: PayloadInfo{ + Directory: "/location/of/download", + VerificationError: &payload.UpdateError{ + Reason: "ImageVerificationFailed", + Message: `Target release version="" image="quay.io/openshift-release-dev/ocp-release@sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4" cannot be verified, but continuing anyway because the update was forced: fails-to-verify`, + }, + }, + }, + { 
+ name: "when sha digest pullspec image passes and download hangs then it is terminated and returns error (RHBZ#2090680)", + verifier: &mockVerifier{expectVerifyDigest: "sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4"}, + downloader: &downloadMocker{ + duration: hangs, + expectCancel: true, + returnErr: errors.New("download was canceled"), + }, + update: configv1.Update{ + Image: "quay.io/openshift-release-dev/ocp-release@sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4", + }, + expectedErr: errors.New("Unable to download and prepare the update: download was canceled"), + }, + { + name: "when sha digest pullspec image fails to verify until timeout then it allows enough time for download and it returns successfully", + verifier: &mockVerifier{ + expectVerifyDigest: "sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4", + expectVerifyCancel: true, + verifyReturns: errors.New("fails-to-verify"), + }, + downloader: &downloadMocker{ + duration: 300 * time.Millisecond, + returnLocation: "/location/of/download", + }, + update: configv1.Update{ + Force: true, + Image: "quay.io/openshift-release-dev/ocp-release@sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4", + }, + expected: PayloadInfo{ + Directory: "/location/of/download", + VerificationError: &payload.UpdateError{ + Reason: "ImageVerificationFailed", + Message: `Target release version="" image="quay.io/openshift-release-dev/ocp-release@sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4" cannot be verified, but continuing anyway because the update was forced: fails-to-verify`, + }, + }, + }, + { + name: "when sha digest pullspec image passes and download succeeds then returns location and no error", + verifier: &mockVerifier{expectVerifyDigest: "sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4"}, + downloader: &downloadMocker{returnLocation: "/location/of/download"}, + update: configv1.Update{ + Image: "quay.io/openshift-release-dev/ocp-release@sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4", + }, + expected: PayloadInfo{ + Directory: "/location/of/download", + Verified: true, + }, + }, + } + for _, tc := range testCases { + tc := tc // prevent parallel closures from sharing a single tc copy + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + retriever := payloadRetriever{ + releaseImage: "releaseImage", + payloadDir: "/local/payload/dir", + verifyTimeoutOnForce: time.Second, + downloadTimeout: time.Second, + } + + if tc.verifier != nil { + tc.verifier.t = t + retriever.verifier = tc.verifier + } + + if tc.downloader != nil { + retriever.downloader = tc.downloader.make(t) + } + + ctx := context.Background() + if tc.ctxTimeout > 0 { + var cancel context.CancelFunc + ctx, cancel = context.WithTimeout(ctx, tc.ctxTimeout) + defer cancel() + } + + actual, err := retriever.RetrievePayload(ctx, tc.update) + if diff := cmp.Diff(tc.expectedErr, err, EquateErrorMessage); diff != "" { + t.Errorf("Returned error differs from expected:\n%s", diff) + } + if diff := cmp.Diff(tc.expected, actual, cmpopts.IgnoreFields(payload.UpdateError{}, "Nested")); err == nil && diff != "" { + t.Errorf("Returned PayloadInfo differs from expected:\n%s", diff) + } + }) + } +} From c78e46e93765e4c238b55b4fc7e873c7b968f3f5 Mon Sep 17 00:00:00 2001 From: Jack Ottofaro Date: Fri, 3 Feb 2023 02:13:13 +0100 Subject: [PATCH 10/21] Bug 2090680: pkg/cvo/updatepayload.go: timeout payload retrieval When we separated payload 
load from payload apply (#683) the context used for the retrieval changed as well. It went from one that was constrained by syncTimeout (2 -4 minutes) [1] to being the unconstrained shutdownContext [2]. However if "force" is specified we explicitly set a 2 minute timeout in RetrievePayload. This commit creates a new context with a reasonable timeout for RetrievePayload regardless of "force". [1] https://github.com/openshift/cluster-version-operator/blob/57ffa7c610fb92ef4ccd9e9c49e75915e86e9296/pkg/cvo/sync_worker.go#L605 [2] https://github.com/openshift/cluster-version-operator/blob/57ffa7c610fb92ef4ccd9e9c49e75915e86e9296/pkg/cvo/cvo.go#L413 --- pkg/cvo/updatepayload.go | 22 ++++++---------------- pkg/cvo/updatepayload_test.go | 2 +- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/pkg/cvo/updatepayload.go b/pkg/cvo/updatepayload.go index f375c053f1..911fca1f9b 100644 --- a/pkg/cvo/updatepayload.go +++ b/pkg/cvo/updatepayload.go @@ -97,22 +97,12 @@ func (r *payloadRetriever) RetrievePayload(ctx context.Context, update configv1. if index := strings.LastIndex(update.Image, "@"); index != -1 { releaseDigest = update.Image[index+1:] } - verifyCtx := ctx - - // if 'force' specified, ensure call to verify payload signature times out well before parent context - // to allow time to perform forced update - if update.Force { - timeout := r.verifyTimeoutOnForce - if deadline, deadlineSet := ctx.Deadline(); deadlineSet { - timeout = time.Until(deadline) / 2 - } - klog.V(2).Infof("Forced update so reducing payload signature verification timeout to %s", timeout) - var cancel context.CancelFunc - verifyCtx, cancel = context.WithTimeout(ctx, timeout) - defer cancel() - } - if err := r.verifier.Verify(verifyCtx, releaseDigest); err != nil { + // set up a new context with reasonable timeout for signature and payload retrieval + retrieveCtx, cancel := context.WithTimeout(ctx, r.verifyTimeoutOnForce+r.downloadTimeout) + defer cancel() + + if err := r.verifier.Verify(retrieveCtx, releaseDigest); err != nil { vErr := &payload.UpdateError{ Reason: "ImageVerificationFailed", Message: fmt.Sprintf("The update cannot be verified: %v", err), @@ -134,7 +124,7 @@ func (r *payloadRetriever) RetrievePayload(ctx context.Context, update configv1. 
// download the payload to the directory var err error - info.Directory, err = r.downloader(ctx, update) + info.Directory, err = r.downloader(retrieveCtx, update) if err != nil { return PayloadInfo{}, &payload.UpdateError{ Reason: "UpdatePayloadRetrievalFailed", diff --git a/pkg/cvo/updatepayload_test.go b/pkg/cvo/updatepayload_test.go index 437987a6aa..daf3361caf 100644 --- a/pkg/cvo/updatepayload_test.go +++ b/pkg/cvo/updatepayload_test.go @@ -272,7 +272,7 @@ func TestPayloadRetrieverRetrievePayload(t *testing.T) { expectedErr: errors.New("Unable to download and prepare the update: download was canceled"), }, { - name: "when sha digest pullspec image fails to verify until timeout then it allows enough time for download and it returns successfully", + name: "when sha digest pullspec image fails to verify until timeout but is forced then it allows enough time for download and it returns successfully", verifier: &mockVerifier{ expectVerifyDigest: "sha256:08ef16270e643a001454410b22864db6246d782298be267688a4433d83f404f4", expectVerifyCancel: true, From fc6417aed94693f4fd634d89205da1b80b715b60 Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Fri, 3 Feb 2023 02:26:20 +0100 Subject: [PATCH 11/21] RetrievePayload: Improve timeouts for corner cases The `RetrievePayload` performs two operations: verification and download. Both can take a non-trivial amount of time to terminate, up to "hanging" where CVO needs to abort the operation. The verification result can be ignored when upgrade is forced. The CVO calls `RetrievePayload` with a context that does not set a deadline, so `RetrievePayload` previously set its own internal deadline, common for both operations. This led to a suboptimal behavior on forced upgrades, where "hanging" verification could eat the whole timeout budget, got cancelled but its result was ignored (because of force). The code tried to proceed with download but that immediately aborts because of the expired context. Improve timeouts in `RetrievePayload` for both input context states: with and without deadline. If the input context sets a deadline, it is respected. If it does not, the default, separate deadlines are applied for both operations. In both cases, the code makes sure the hanging verification never spends the whole budget. When verification terminates fast, the rest of its alloted time is provided to the download operation. --- pkg/cvo/updatepayload.go | 25 +++++++++++++++++-------- pkg/cvo/updatepayload_test.go | 2 ++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/pkg/cvo/updatepayload.go b/pkg/cvo/updatepayload.go index 911fca1f9b..024a8721da 100644 --- a/pkg/cvo/updatepayload.go +++ b/pkg/cvo/updatepayload.go @@ -90,19 +90,29 @@ func (r *payloadRetriever) RetrievePayload(ctx context.Context, update configv1. 
return PayloadInfo{}, fmt.Errorf("no payload image has been specified and the contents of the payload cannot be retrieved") } - var info PayloadInfo - // verify the provided payload var releaseDigest string if index := strings.LastIndex(update.Image, "@"); index != -1 { releaseDigest = update.Image[index+1:] } - // set up a new context with reasonable timeout for signature and payload retrieval - retrieveCtx, cancel := context.WithTimeout(ctx, r.verifyTimeoutOnForce+r.downloadTimeout) - defer cancel() + var downloadCtx context.Context + var downloadCtxCancel context.CancelFunc + var verifyTimeout time.Duration - if err := r.verifier.Verify(retrieveCtx, releaseDigest); err != nil { + if deadline, ok := ctx.Deadline(); ok { + verifyTimeout = time.Until(deadline) / 2 + downloadCtx = ctx + } else { + verifyTimeout = r.verifyTimeoutOnForce + downloadCtx, downloadCtxCancel = context.WithTimeout(ctx, r.verifyTimeoutOnForce+r.downloadTimeout) + defer downloadCtxCancel() + } + verifyCtx, verifyCtxCancel := context.WithTimeout(ctx, verifyTimeout) + defer verifyCtxCancel() + + var info PayloadInfo + if err := r.verifier.Verify(verifyCtx, releaseDigest); err != nil { vErr := &payload.UpdateError{ Reason: "ImageVerificationFailed", Message: fmt.Sprintf("The update cannot be verified: %v", err), @@ -121,10 +131,9 @@ func (r *payloadRetriever) RetrievePayload(ctx context.Context, update configv1. if r.downloader == nil { r.downloader = r.targetUpdatePayloadDir } - // download the payload to the directory var err error - info.Directory, err = r.downloader(retrieveCtx, update) + info.Directory, err = r.downloader(downloadCtx, update) if err != nil { return PayloadInfo{}, &payload.UpdateError{ Reason: "UpdatePayloadRetrievalFailed", diff --git a/pkg/cvo/updatepayload_test.go b/pkg/cvo/updatepayload_test.go index daf3361caf..e20636d70d 100644 --- a/pkg/cvo/updatepayload_test.go +++ b/pkg/cvo/updatepayload_test.go @@ -10,6 +10,8 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" configv1 "github.com/openshift/api/config/v1" "github.com/openshift/library-go/pkg/verify/store" + + //nolint:staticcheck // verify,Verifier from openshift/library-go uses a type from this deprecated package (needs to be addressed there) "golang.org/x/crypto/openpgp" "github.com/openshift/cluster-version-operator/pkg/payload" From c484ef9b2cc1894e2df2303c97f09844ff061486 Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Tue, 14 Feb 2023 18:35:45 +0100 Subject: [PATCH 12/21] RetrievePayload: improve godocs and refactor (address review) --- pkg/cvo/updatepayload.go | 55 ++++++++++++++++++++++------------- pkg/cvo/updatepayload_test.go | 7 ++--- 2 files changed, 37 insertions(+), 25 deletions(-) diff --git a/pkg/cvo/updatepayload.go b/pkg/cvo/updatepayload.go index 024a8721da..702f195028 100644 --- a/pkg/cvo/updatepayload.go +++ b/pkg/cvo/updatepayload.go @@ -38,16 +38,15 @@ func (optr *Operator) defaultPayloadDir() string { func (optr *Operator) defaultPayloadRetriever() PayloadRetriever { return &payloadRetriever{ - kubeClient: optr.kubeClient, - operatorName: optr.name, - releaseImage: optr.release.Image, - namespace: optr.namespace, - nodeName: optr.nodename, - payloadDir: optr.defaultPayloadDir(), - workingDir: targetUpdatePayloadsDir, - verifier: optr.verifier, - verifyTimeoutOnForce: 2 * time.Minute, - downloadTimeout: 2 * time.Minute, + kubeClient: optr.kubeClient, + operatorName: optr.name, + releaseImage: optr.release.Image, + namespace: optr.namespace, + nodeName: optr.nodename, + payloadDir: optr.defaultPayloadDir(), + 
workingDir: targetUpdatePayloadsDir, + verifier: optr.verifier, + retrieveTimeout: 4 * time.Minute, } } @@ -55,6 +54,10 @@ const ( targetUpdatePayloadsDir = "/etc/cvo/updatepayloads" ) +// downloadFunc downloads the requested update and returns either a path on the local filesystem +// containing extracted manifests or an error +// The type exists so that tests for payloadRetriever.RetrievePayload can mock this functionality +// by setting payloadRetriever.downloader. type downloadFunc func(context.Context, configv1.Update) (string, error) type payloadRetriever struct { @@ -71,13 +74,23 @@ type payloadRetriever struct { operatorName string // verifier guards against invalid remote data being accessed - verifier verify.Interface - verifyTimeoutOnForce time.Duration + verifier verify.Interface - downloader downloadFunc - downloadTimeout time.Duration + // downloader is called to download the requested update to local filesystem. It should only be + // set to non-nil value in tests. When this is nil, payloadRetriever.targetUpdatePayloadDir + // is called as a downloader. + downloader downloadFunc + + // retrieveTimeout limits the time spent in payloadRetriever.RetrievePayload. This timeout is only + // applied when the context passed into that method is unbounded; otherwise the method respects + // the deadline set in the input context. + retrieveTimeout time.Duration } +// RetrievePayload verifies, downloads and extracts to local filesystem the payload image specified +// by update. If the input context has a deadline, it is respected, otherwise r.retrieveTimeout +// applies. When update.Force is true, the verification is still performed, but the method proceeds +// even when the image cannot be verified successfully. func (r *payloadRetriever) RetrievePayload(ctx context.Context, update configv1.Update) (PayloadInfo, error) { if r.releaseImage == update.Image { return PayloadInfo{ @@ -97,17 +110,17 @@ func (r *payloadRetriever) RetrievePayload(ctx context.Context, update configv1. 
} var downloadCtx context.Context - var downloadCtxCancel context.CancelFunc - var verifyTimeout time.Duration - - if deadline, ok := ctx.Deadline(); ok { - verifyTimeout = time.Until(deadline) / 2 + downloadDeadline, ok := ctx.Deadline() + if ok { downloadCtx = ctx } else { - verifyTimeout = r.verifyTimeoutOnForce - downloadCtx, downloadCtxCancel = context.WithTimeout(ctx, r.verifyTimeoutOnForce+r.downloadTimeout) + downloadDeadline = time.Now().Add(r.retrieveTimeout) + var downloadCtxCancel context.CancelFunc + downloadCtx, downloadCtxCancel = context.WithDeadline(ctx, downloadDeadline) defer downloadCtxCancel() } + + verifyTimeout := time.Until(downloadDeadline) / 2 verifyCtx, verifyCtxCancel := context.WithTimeout(ctx, verifyTimeout) defer verifyCtxCancel() diff --git a/pkg/cvo/updatepayload_test.go b/pkg/cvo/updatepayload_test.go index e20636d70d..a93b463a7d 100644 --- a/pkg/cvo/updatepayload_test.go +++ b/pkg/cvo/updatepayload_test.go @@ -314,10 +314,9 @@ func TestPayloadRetrieverRetrievePayload(t *testing.T) { t.Run(tc.name, func(t *testing.T) { t.Parallel() retriever := payloadRetriever{ - releaseImage: "releaseImage", - payloadDir: "/local/payload/dir", - verifyTimeoutOnForce: time.Second, - downloadTimeout: time.Second, + releaseImage: "releaseImage", + payloadDir: "/local/payload/dir", + retrieveTimeout: 2 * time.Second, } if tc.verifier != nil { From 5602fa1d18be853ade9292c43f5362ab8bb41517 Mon Sep 17 00:00:00 2001 From: David Eads Date: Wed, 29 Mar 2023 16:14:47 -0400 Subject: [PATCH 13/21] Update dnsPolicy to allow consistent resolution of the internal LB The kubelet consistently resolves the name. This change allows the CVO to use the kubelet's DNS configuration. --- install/0000_00_cluster-version-operator_03_deployment.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/install/0000_00_cluster-version-operator_03_deployment.yaml b/install/0000_00_cluster-version-operator_03_deployment.yaml index 7b608829c7..d8a32bd694 100644 --- a/install/0000_00_cluster-version-operator_03_deployment.yaml +++ b/install/0000_00_cluster-version-operator_03_deployment.yaml @@ -66,7 +66,9 @@ spec: fieldPath: spec.nodeName - name: CLUSTER_PROFILE value: {{ .ClusterProfile }} - dnsPolicy: ClusterFirstWithHostNet + # this pod is hostNetwork and uses the internal LB DNS name when possible, which the kubelet also uses. + # this dnsPolicy allows us to use the same dnsConfig as the kubelet, without access to read it ourselves. + dnsPolicy: Default hostNetwork: true nodeSelector: node-role.kubernetes.io/master: "" From 89c986f4276946d64cdfc128056f68c589d01640 Mon Sep 17 00:00:00 2001 From: David Eads Date: Fri, 31 Mar 2023 10:58:52 -0400 Subject: [PATCH 14/21] replace global clustercondition registry with instance based approach Globals cause problems for unit tests (demonstrated in some of the unit tests) and prevent general reuse. The anti-pattern is avoided in kuberentes code and should be OCP as well. This updates to providing a configured condition registry without init blocks and provides a standard clustercondition registry for each consumption. 
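As a rough standalone sketch (illustration only, not code from this patch), the
instance-based wiring looks like this; the names follow the interfaces added
below, and a unit test can build its own registry with mock conditions instead
of fighting a shared global:

    package main

    import (
        "context"
        "fmt"

        configv1 "github.com/openshift/api/config/v1"

        "github.com/openshift/cluster-version-operator/pkg/clusterconditions"
        "github.com/openshift/cluster-version-operator/pkg/clusterconditions/always"
    )

    func main() {
        // Build and configure a registry instance explicitly instead of
        // relying on package init() side effects.
        registry := clusterconditions.NewConditionRegistry()
        registry.Register("Always", &always.Always{})

        // The instance is then handed to consumers (Cincinnati client,
        // conditional-update evaluation) rather than reached for globally.
        match, err := registry.Match(context.Background(),
            []configv1.ClusterCondition{{Type: "Always"}})
        fmt.Println(match, err)
    }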
--- cmd/main.go | 3 -- pkg/cincinnati/cincinnati.go | 15 ++++--- pkg/cincinnati/cincinnati_test.go | 6 +-- pkg/clusterconditions/always/always.go | 5 --- pkg/clusterconditions/clusterconditions.go | 45 ++++++++++++++----- .../clusterconditions_test.go | 15 +++---- pkg/clusterconditions/promql/promql.go | 37 +++++++-------- pkg/clusterconditions/standard/standard.go | 16 +++++++ pkg/cvo/availableupdates.go | 15 ++++--- pkg/cvo/cvo.go | 8 ++++ 10 files changed, 103 insertions(+), 62 deletions(-) create mode 100644 pkg/clusterconditions/standard/standard.go diff --git a/cmd/main.go b/cmd/main.go index a11b054c30..1414798925 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -5,9 +5,6 @@ import ( "github.com/spf13/cobra" "k8s.io/klog/v2" - - _ "github.com/openshift/cluster-version-operator/pkg/clusterconditions/always" - _ "github.com/openshift/cluster-version-operator/pkg/clusterconditions/promql" ) var ( diff --git a/pkg/cincinnati/cincinnati.go b/pkg/cincinnati/cincinnati.go index 9cdc00e3eb..d3d41e2aa7 100644 --- a/pkg/cincinnati/cincinnati.go +++ b/pkg/cincinnati/cincinnati.go @@ -30,13 +30,18 @@ const ( // Client is a Cincinnati client which can be used to fetch update graphs from // an upstream Cincinnati stack. type Client struct { - id uuid.UUID - transport *http.Transport + id uuid.UUID + transport *http.Transport + conditionRegistry clusterconditions.ConditionRegistry } // NewClient creates a new Cincinnati client with the given client identifier. -func NewClient(id uuid.UUID, transport *http.Transport) Client { - return Client{id: id, transport: transport} +func NewClient(id uuid.UUID, transport *http.Transport, conditionRegistry clusterconditions.ConditionRegistry) Client { + return Client{ + id: id, + transport: transport, + conditionRegistry: conditionRegistry, + } } // Error is returned when are unable to get updates. @@ -216,7 +221,7 @@ func (c Client) GetUpdates(ctx context.Context, uri *url.URL, arch string, chann for i := len(conditionalUpdates) - 1; i >= 0; i-- { for j, risk := range conditionalUpdates[i].Risks { - conditionalUpdates[i].Risks[j].MatchingRules, err = clusterconditions.PruneInvalid(ctx, risk.MatchingRules) + conditionalUpdates[i].Risks[j].MatchingRules, err = c.conditionRegistry.PruneInvalid(ctx, risk.MatchingRules) if len(conditionalUpdates[i].Risks[j].MatchingRules) == 0 { klog.Warningf("Conditional update to %s, risk %q, has empty pruned matchingRules; dropping this target to avoid rejections when pushing to the Kubernetes API server. Pruning results: %s", conditionalUpdates[i].Release.Version, risk.Name, err) conditionalUpdates = append(conditionalUpdates[:i], conditionalUpdates[i+1:]...) diff --git a/pkg/cincinnati/cincinnati_test.go b/pkg/cincinnati/cincinnati_test.go index 28b8193cc6..776257371f 100644 --- a/pkg/cincinnati/cincinnati_test.go +++ b/pkg/cincinnati/cincinnati_test.go @@ -10,11 +10,11 @@ import ( "reflect" "testing" + "github.com/openshift/cluster-version-operator/pkg/clusterconditions/standard" + "github.com/blang/semver/v4" "github.com/google/uuid" configv1 "github.com/openshift/api/config/v1" - _ "github.com/openshift/cluster-version-operator/pkg/clusterconditions/always" - _ "github.com/openshift/cluster-version-operator/pkg/clusterconditions/promql" _ "k8s.io/klog/v2" // integration tests set glog flags. 
) @@ -604,7 +604,7 @@ func TestGetUpdates(t *testing.T) { ts := httptest.NewServer(http.HandlerFunc(handler)) defer ts.Close() - c := NewClient(clientID, nil) + c := NewClient(clientID, nil, standard.NewConditionRegistry(nil)) uri, err := url.Parse(ts.URL) if err != nil { diff --git a/pkg/clusterconditions/always/always.go b/pkg/clusterconditions/always/always.go index 6267761eb7..2c61e1549a 100644 --- a/pkg/clusterconditions/always/always.go +++ b/pkg/clusterconditions/always/always.go @@ -8,7 +8,6 @@ import ( "errors" configv1 "github.com/openshift/api/config/v1" - "github.com/openshift/cluster-version-operator/pkg/clusterconditions" ) // Always implements a cluster condition that always matches. @@ -30,7 +29,3 @@ func (a *Always) Valid(ctx context.Context, condition *configv1.ClusterCondition func (a *Always) Match(ctx context.Context, condition *configv1.ClusterCondition) (bool, error) { return true, nil } - -func init() { - clusterconditions.Register("Always", always) -} diff --git a/pkg/clusterconditions/clusterconditions.go b/pkg/clusterconditions/clusterconditions.go index e532547cbe..d961d67379 100644 --- a/pkg/clusterconditions/clusterconditions.go +++ b/pkg/clusterconditions/clusterconditions.go @@ -25,28 +25,51 @@ type Condition interface { Match(ctx context.Context, condition *configv1.ClusterCondition) (bool, error) } -// Registry is a registry of implemented condition types. -var Registry map[string]Condition +type ConditionRegistry interface { + // Register registers a condition type, and panics on any name collisions. + Register(conditionType string, condition Condition) + + // PruneInvalid returns a new slice with recognized, valid conditions. + // The error complains about any unrecognized or invalid conditions. + PruneInvalid(ctx context.Context, matchingRules []configv1.ClusterCondition) ([]configv1.ClusterCondition, error) + + // Match returns whether the cluster matches the given rules (true), + // does not match (false), or the rules fail to evaluate (error). + Match(ctx context.Context, matchingRules []configv1.ClusterCondition) (bool, error) +} + +type conditionRegistry struct { + // registry is a registry of implemented condition types. + registry map[string]Condition +} + +func NewConditionRegistry() ConditionRegistry { + ret := &conditionRegistry{ + registry: map[string]Condition{}, + } + + return ret +} // Register registers a condition type, and panics on any name collisions. -func Register(conditionType string, condition Condition) { - if Registry == nil { - Registry = make(map[string]Condition, 1) +func (r *conditionRegistry) Register(conditionType string, condition Condition) { + if r.registry == nil { + r.registry = make(map[string]Condition, 1) } - if existing, ok := Registry[conditionType]; ok && condition != existing { + if existing, ok := r.registry[conditionType]; ok && condition != existing { panic(fmt.Sprintf("cluster condition %q already registered", conditionType)) } - Registry[conditionType] = condition + r.registry[conditionType] = condition } // PruneInvalid returns a new slice with recognized, valid conditions. // The error complains about any unrecognized or invalid conditions. 
-func PruneInvalid(ctx context.Context, matchingRules []configv1.ClusterCondition) ([]configv1.ClusterCondition, error) { +func (r *conditionRegistry) PruneInvalid(ctx context.Context, matchingRules []configv1.ClusterCondition) ([]configv1.ClusterCondition, error) { var valid []configv1.ClusterCondition var errs []error for _, config := range matchingRules { - condition, ok := Registry[config.Type] + condition, ok := r.registry[config.Type] if !ok { errs = append(errs, fmt.Errorf("Skipping unrecognized cluster condition type %q", config.Type)) continue @@ -63,11 +86,11 @@ func PruneInvalid(ctx context.Context, matchingRules []configv1.ClusterCondition // Match returns whether the cluster matches the given rules (true), // does not match (false), or the rules fail to evaluate (error). -func Match(ctx context.Context, matchingRules []configv1.ClusterCondition) (bool, error) { +func (r *conditionRegistry) Match(ctx context.Context, matchingRules []configv1.ClusterCondition) (bool, error) { var errs []error for _, config := range matchingRules { - condition, ok := Registry[config.Type] + condition, ok := r.registry[config.Type] if !ok { klog.V(2).Infof("Skipping unrecognized cluster condition type %q", config.Type) continue diff --git a/pkg/clusterconditions/clusterconditions_test.go b/pkg/clusterconditions/clusterconditions_test.go index 4acb9b0659..d00541306d 100644 --- a/pkg/clusterconditions/clusterconditions_test.go +++ b/pkg/clusterconditions/clusterconditions_test.go @@ -8,10 +8,7 @@ import ( "testing" configv1 "github.com/openshift/api/config/v1" - - "github.com/openshift/cluster-version-operator/pkg/clusterconditions" - _ "github.com/openshift/cluster-version-operator/pkg/clusterconditions/always" - _ "github.com/openshift/cluster-version-operator/pkg/clusterconditions/promql" + "github.com/openshift/cluster-version-operator/pkg/clusterconditions/standard" ) // Error implements a cluster condition that always errors. 
@@ -33,6 +30,7 @@ func (e *Error) Match(ctx context.Context, condition *configv1.ClusterCondition) func TestPruneInvalid(t *testing.T) { ctx := context.Background() + registry := standard.NewConditionRegistry(nil) for _, testCase := range []struct { name string @@ -100,7 +98,7 @@ func TestPruneInvalid(t *testing.T) { }, } { t.Run(testCase.name, func(t *testing.T) { - valid, err := clusterconditions.PruneInvalid(ctx, testCase.conditions) + valid, err := registry.PruneInvalid(ctx, testCase.conditions) if !reflect.DeepEqual(valid, testCase.expectedValid) { t.Errorf("got valid %v but expected %v", valid, testCase.expectedValid) } @@ -117,7 +115,8 @@ func TestPruneInvalid(t *testing.T) { func TestMatch(t *testing.T) { ctx := context.Background() - clusterconditions.Register("Error", &Error{}) + registry := standard.NewConditionRegistry(nil) + registry.Register("Error", &Error{}) for _, testCase := range []struct { name string @@ -181,7 +180,7 @@ func TestMatch(t *testing.T) { }, } { t.Run(testCase.name, func(t *testing.T) { - match, err := clusterconditions.Match(ctx, testCase.conditions) + match, err := registry.Match(ctx, testCase.conditions) if match != testCase.expectedMatch { t.Errorf("got match %t but expected %t", match, testCase.expectedMatch) } @@ -194,6 +193,4 @@ func TestMatch(t *testing.T) { } }) } - - delete(clusterconditions.Registry, "Error") } diff --git a/pkg/clusterconditions/promql/promql.go b/pkg/clusterconditions/promql/promql.go index 5b84069995..5f96ad5119 100644 --- a/pkg/clusterconditions/promql/promql.go +++ b/pkg/clusterconditions/promql/promql.go @@ -16,7 +16,6 @@ import ( "github.com/prometheus/common/model" "k8s.io/klog/v2" - "github.com/openshift/cluster-version-operator/pkg/clusterconditions" "github.com/openshift/cluster-version-operator/pkg/clusterconditions/cache" ) @@ -32,23 +31,25 @@ type PromQL struct { QueryTimeout time.Duration } -var promql = &cache.Cache{ - Condition: &PromQL{ - Address: "https://thanos-querier.openshift-monitoring.svc.cluster.local:9091", - HTTPClientConfig: config.HTTPClientConfig{ - Authorization: &config.Authorization{ - Type: "Bearer", - CredentialsFile: "/var/run/secrets/kubernetes.io/serviceaccount/token", - }, - TLSConfig: config.TLSConfig{ - CAFile: "/etc/tls/service-ca/service-ca.crt", +func NewPromQL() *cache.Cache { + return &cache.Cache{ + Condition: &PromQL{ + HTTPClientConfig: config.HTTPClientConfig{ + Authorization: &config.Authorization{ + Type: "Bearer", + CredentialsFile: "/var/run/secrets/kubernetes.io/serviceaccount/token", + }, + TLSConfig: config.TLSConfig{ + CAFile: "/etc/tls/service-ca/service-ca.crt", + ServerName: "thanos-querier.openshift-monitoring.svc.cluster.local", + }, }, + QueryTimeout: 5 * time.Minute, }, - QueryTimeout: 5 * time.Minute, - }, - MinBetweenMatches: 10 * time.Minute, - MinForCondition: time.Hour, - Expiration: 24 * time.Hour, + MinBetweenMatches: 10 * time.Minute, + MinForCondition: time.Hour, + Expiration: 24 * time.Hour, + } } // Valid returns an error if the condition contains any properties @@ -122,7 +123,3 @@ func (p *PromQL) Match(ctx context.Context, condition *configv1.ClusterCondition } return false, fmt.Errorf("invalid PromQL result (must be 0 or 1): %v", sample.Value) } - -func init() { - clusterconditions.Register("PromQL", promql) -} diff --git a/pkg/clusterconditions/standard/standard.go b/pkg/clusterconditions/standard/standard.go new file mode 100644 index 0000000000..bc3df446b0 --- /dev/null +++ b/pkg/clusterconditions/standard/standard.go @@ -0,0 +1,16 @@ +package 
standard + +import ( + "github.com/openshift/cluster-version-operator/pkg/clusterconditions" + "github.com/openshift/cluster-version-operator/pkg/clusterconditions/always" + "github.com/openshift/cluster-version-operator/pkg/clusterconditions/promql" + "k8s.io/client-go/kubernetes" +) + +func NewConditionRegistry(kubeClient kubernetes.Interface) clusterconditions.ConditionRegistry { + conditionRegistry := clusterconditions.NewConditionRegistry() + conditionRegistry.Register("Always", &always.Always{}) + conditionRegistry.Register("PromQL", promql.NewPromQL()) + + return conditionRegistry +} diff --git a/pkg/cvo/availableupdates.go b/pkg/cvo/availableupdates.go index 2c9d13469e..cdbd01d422 100644 --- a/pkg/cvo/availableupdates.go +++ b/pkg/cvo/availableupdates.go @@ -62,7 +62,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 return err } - current, updates, conditionalUpdates, condition := calculateAvailableUpdatesStatus(ctx, string(config.Spec.ClusterID), transport, upstream, arch, channel, optr.release.Version) + current, updates, conditionalUpdates, condition := calculateAvailableUpdatesStatus(ctx, string(config.Spec.ClusterID), transport, upstream, arch, channel, optr.release.Version, optr.conditionRegistry) if usedDefaultUpstream { upstream = "" @@ -75,6 +75,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 Current: current, Updates: updates, ConditionalUpdates: conditionalUpdates, + ConditionRegistry: optr.conditionRegistry, Condition: condition, } @@ -109,6 +110,7 @@ type availableUpdates struct { Current configv1.Release Updates []configv1.Release ConditionalUpdates []configv1.ConditionalUpdate + ConditionRegistry clusterconditions.ConditionRegistry Condition configv1.ClusterOperatorStatusCondition } @@ -198,7 +200,8 @@ func (optr *Operator) SetArchitecture(architecture string) { optr.architecture = architecture } -func calculateAvailableUpdatesStatus(ctx context.Context, clusterID string, transport *http.Transport, upstream, arch, channel, version string) (configv1.Release, []configv1.Release, []configv1.ConditionalUpdate, configv1.ClusterOperatorStatusCondition) { +func calculateAvailableUpdatesStatus(ctx context.Context, clusterID string, transport *http.Transport, upstream, arch, channel, version string, conditionRegistry clusterconditions.ConditionRegistry) (configv1.Release, []configv1.Release, []configv1.ConditionalUpdate, configv1.ClusterOperatorStatusCondition) { + var cvoCurrent configv1.Release if len(upstream) == 0 { return cvoCurrent, nil, nil, configv1.ClusterOperatorStatusCondition{ @@ -253,7 +256,7 @@ func calculateAvailableUpdatesStatus(ctx context.Context, clusterID string, tran } } - current, updates, conditionalUpdates, err := cincinnati.NewClient(uuid, transport).GetUpdates(ctx, upstreamURI, arch, channel, currentVersion) + current, updates, conditionalUpdates, err := cincinnati.NewClient(uuid, transport, conditionRegistry).GetUpdates(ctx, upstreamURI, arch, channel, currentVersion) if err != nil { klog.V(2).Infof("Upstream server %s could not return available updates: %v", upstream, err) if updateError, ok := err.(*cincinnati.Error); ok { @@ -289,7 +292,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { }) for i, conditionalUpdate := range u.ConditionalUpdates { - if errorCondition := evaluateConditionalUpdate(ctx, &conditionalUpdate); errorCondition != nil { + if errorCondition := evaluateConditionalUpdate(ctx, &conditionalUpdate, u.ConditionRegistry); 
errorCondition != nil { meta.SetStatusCondition(&conditionalUpdate.Conditions, *errorCondition) u.removeUpdate(ctx, conditionalUpdate.Release.Image) } else { @@ -314,13 +317,13 @@ func (u *availableUpdates) removeUpdate(ctx context.Context, image string) { } } -func evaluateConditionalUpdate(ctx context.Context, conditionalUpdate *configv1.ConditionalUpdate) *metav1.Condition { +func evaluateConditionalUpdate(ctx context.Context, conditionalUpdate *configv1.ConditionalUpdate, conditionRegistry clusterconditions.ConditionRegistry) *metav1.Condition { recommended := &metav1.Condition{ Type: "Recommended", } messages := []string{} for _, risk := range conditionalUpdate.Risks { - if match, err := clusterconditions.Match(ctx, risk.MatchingRules); err != nil { + if match, err := conditionRegistry.Match(ctx, risk.MatchingRules); err != nil { if recommended.Status != metav1.ConditionFalse { recommended.Status = metav1.ConditionUnknown } diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go index a70b73eed3..6b19ce1c32 100644 --- a/pkg/cvo/cvo.go +++ b/pkg/cvo/cvo.go @@ -8,6 +8,10 @@ import ( "sync" "time" + "github.com/openshift/cluster-version-operator/pkg/clusterconditions/standard" + + "github.com/openshift/cluster-version-operator/pkg/clusterconditions" + corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -129,6 +133,9 @@ type Operator struct { // synchronization upgradeableCheckIntervals upgradeableCheckIntervals + // conditionRegistry is used to evaluate whether a particular condition is risky or not. + conditionRegistry clusterconditions.ConditionRegistry + // verifier, if provided, will be used to check an update before it is executed. // Any error will prevent an update payload from being accessed. verifier verify.Interface @@ -204,6 +211,7 @@ func New( exclude: exclude, requiredFeatureSet: requiredFeatureSet, clusterProfile: clusterProfile, + conditionRegistry: standard.NewConditionRegistry(kubeClient), } cvInformer.Informer().AddEventHandler(optr.clusterVersionEventHandler()) From f9659cc8f680b963455bcf2ec2240f0f561d743b Mon Sep 17 00:00:00 2001 From: David Eads Date: Fri, 31 Mar 2023 10:01:56 -0400 Subject: [PATCH 15/21] connect to thanos using IP We do this so that our host-network pod can use the node's resolv.conf to resolve the internal load balancer name on the pod before DNS pods are available and before the service network is available. The side effect is that the CVO cannot resolve service DNS names. --- pkg/clusterconditions/always/always.go | 2 -- pkg/clusterconditions/promql/promql.go | 33 ++++++++++++++++++---- pkg/clusterconditions/standard/standard.go | 2 +- pkg/cvo/cvo.go | 6 ++-- 4 files changed, 31 insertions(+), 12 deletions(-) diff --git a/pkg/clusterconditions/always/always.go b/pkg/clusterconditions/always/always.go index 2c61e1549a..ca24f0a3e5 100644 --- a/pkg/clusterconditions/always/always.go +++ b/pkg/clusterconditions/always/always.go @@ -13,8 +13,6 @@ import ( // Always implements a cluster condition that always matches. type Always struct{} -var always = &Always{} - // Valid returns an error if the condition contains any properties // besides 'type'. 
func (a *Always) Valid(ctx context.Context, condition *configv1.ClusterCondition) error { diff --git a/pkg/clusterconditions/promql/promql.go b/pkg/clusterconditions/promql/promql.go index 5f96ad5119..7a75046e81 100644 --- a/pkg/clusterconditions/promql/promql.go +++ b/pkg/clusterconditions/promql/promql.go @@ -7,6 +7,7 @@ import ( "context" "errors" "fmt" + "net" "time" configv1 "github.com/openshift/api/config/v1" @@ -14,6 +15,8 @@ import ( prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" "github.com/prometheus/common/config" "github.com/prometheus/common/model" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" "k8s.io/klog/v2" "github.com/openshift/cluster-version-operator/pkg/clusterconditions/cache" @@ -21,8 +24,7 @@ import ( // PromQL implements a cluster condition that matches based on PromQL. type PromQL struct { - // Address holds the Prometheus query URI. - Address string + kubeClient kubernetes.Interface // HTTPClientConfig holds the client configuration for connecting to the Prometheus service. HTTPClientConfig config.HTTPClientConfig @@ -31,16 +33,18 @@ type PromQL struct { QueryTimeout time.Duration } -func NewPromQL() *cache.Cache { +func NewPromQL(kubeClient kubernetes.Interface) *cache.Cache { return &cache.Cache{ Condition: &PromQL{ + kubeClient: kubeClient, HTTPClientConfig: config.HTTPClientConfig{ Authorization: &config.Authorization{ Type: "Bearer", CredentialsFile: "/var/run/secrets/kubernetes.io/serviceaccount/token", }, TLSConfig: config.TLSConfig{ - CAFile: "/etc/tls/service-ca/service-ca.crt", + CAFile: "/etc/tls/service-ca/service-ca.crt", + // ServerName is used to verify the name of the service we will connect to using IP. ServerName: "thanos-querier.openshift-monitoring.svc.cluster.local", }, }, @@ -52,6 +56,19 @@ func NewPromQL() *cache.Cache { } } +// Address determines the address of the thanos-querier to avoid requiring service DNS resolution. +// We do this so that our host-network pod can use the node's resolv.conf to resolve the internal load balancer name +// on the pod before DNS pods are available and before the service network is available. The side effect is that +// the CVO cannot resolve service DNS names. +func (p *PromQL) Address(ctx context.Context) (string, error) { + svc, err := p.kubeClient.CoreV1().Services("openshift-monitoring").Get(ctx, "thanos-querier", metav1.GetOptions{}) + if err != nil { + return "", err + } + + return fmt.Sprintf("https://%s", net.JoinHostPort(svc.Spec.ClusterIP, "9091")), nil +} + // Valid returns an error if the condition contains any properties // besides 'type' and a valid `promql`. func (p *PromQL) Valid(ctx context.Context, condition *configv1.ClusterCondition) error { @@ -70,7 +87,13 @@ func (p *PromQL) Valid(ctx context.Context, condition *configv1.ClusterCondition // false when the PromQL evaluates to 0, and an error if the PromQL // returns no time series or returns a value besides 0 or 1. func (p *PromQL) Match(ctx context.Context, condition *configv1.ClusterCondition) (bool, error) { - clientConfig := api.Config{Address: p.Address} + // Lookup the address every attempt in case the service IP changes. This can happen when the thanos service is + // deleted and recreated. 
+ address, err := p.Address(ctx) + if err != nil { + return false, fmt.Errorf("failure determine thanos IP: %w", err) + } + clientConfig := api.Config{Address: address} if roundTripper, err := config.NewRoundTripperFromConfig(p.HTTPClientConfig, "cluster-conditions"); err == nil { clientConfig.RoundTripper = roundTripper diff --git a/pkg/clusterconditions/standard/standard.go b/pkg/clusterconditions/standard/standard.go index bc3df446b0..97c2814f88 100644 --- a/pkg/clusterconditions/standard/standard.go +++ b/pkg/clusterconditions/standard/standard.go @@ -10,7 +10,7 @@ import ( func NewConditionRegistry(kubeClient kubernetes.Interface) clusterconditions.ConditionRegistry { conditionRegistry := clusterconditions.NewConditionRegistry() conditionRegistry.Register("Always", &always.Always{}) - conditionRegistry.Register("PromQL", promql.NewPromQL()) + conditionRegistry.Register("PromQL", promql.NewPromQL(kubeClient)) return conditionRegistry } diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go index 6b19ce1c32..2f8e986f06 100644 --- a/pkg/cvo/cvo.go +++ b/pkg/cvo/cvo.go @@ -8,10 +8,6 @@ import ( "sync" "time" - "github.com/openshift/cluster-version-operator/pkg/clusterconditions/standard" - - "github.com/openshift/cluster-version-operator/pkg/clusterconditions" - corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -37,6 +33,8 @@ import ( "github.com/openshift/cluster-version-operator/lib/capability" "github.com/openshift/cluster-version-operator/lib/resourcebuilder" "github.com/openshift/cluster-version-operator/lib/validation" + "github.com/openshift/cluster-version-operator/pkg/clusterconditions" + "github.com/openshift/cluster-version-operator/pkg/clusterconditions/standard" cvointernal "github.com/openshift/cluster-version-operator/pkg/cvo/internal" "github.com/openshift/cluster-version-operator/pkg/cvo/internal/dynamicclient" "github.com/openshift/cluster-version-operator/pkg/internal" From d877813be7ee81e805b6b43e0df13fd1a8e27f1c Mon Sep 17 00:00:00 2001 From: John Eckersberg Date: Thu, 23 Feb 2023 11:16:20 -0500 Subject: [PATCH 16/21] OCPBUGS-7419: Trigger new sync round on ClusterOperator Available changes --- pkg/cvo/cvo.go | 47 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go index 2f8e986f06..f901748e07 100644 --- a/pkg/cvo/cvo.go +++ b/pkg/cvo/cvo.go @@ -501,29 +501,64 @@ func (optr *Operator) clusterVersionEventHandler() cache.ResourceEventHandler { func (optr *Operator) clusterOperatorEventHandler() cache.ResourceEventHandler { return cache.ResourceEventHandlerFuncs{ UpdateFunc: func(old, new interface{}) { + if optr.configSync == nil { + return + } + versionName := "operator" - _, oldVersion := clusterOperatorInterfaceVersionOrDie(old, versionName) - newStruct, newVersion := clusterOperatorInterfaceVersionOrDie(new, versionName) - if optr.configSync != nil && oldVersion != newVersion { + oldStruct := clusterOperatorInterfaceStructOrDie(old) + newStruct := clusterOperatorInterfaceStructOrDie(new) + + oldVersion := clusterOperatorVersion(oldStruct, versionName) + newVersion := clusterOperatorVersion(newStruct, versionName) + if oldVersion != newVersion { msg := fmt.Sprintf("Cluster operator %s changed versions[name=%q] from %q to %q", newStruct.ObjectMeta.Name, versionName, oldVersion, newVersion) optr.configSync.NotifyAboutManagedResourceActivity(new, msg) + return } + + for _, cond := range 
[]configv1.ClusterStatusConditionType{ + configv1.OperatorAvailable, + configv1.OperatorDegraded, + } { + oldStatus := clusterOperatorConditionStatus(oldStruct, cond) + newStatus := clusterOperatorConditionStatus(newStruct, cond) + if oldStatus != newStatus { + msg := fmt.Sprintf("Cluster operator %s changed %s from %q to %q", newStruct.ObjectMeta.Name, cond, oldStatus, newStatus) + optr.configSync.NotifyAboutManagedResourceActivity(new, msg) + return + } + } + }, } } -func clusterOperatorInterfaceVersionOrDie(obj interface{}, name string) (*configv1.ClusterOperator, string) { +func clusterOperatorInterfaceStructOrDie(obj interface{}) *configv1.ClusterOperator { co, ok := obj.(*configv1.ClusterOperator) if !ok { panic(fmt.Sprintf("%v is %T, not a ClusterOperator", obj, obj)) } + return co +} + +func clusterOperatorVersion(co *configv1.ClusterOperator, name string) string { for _, version := range co.Status.Versions { if version.Name == name { - return co, version.Version + return version.Version + } + } + return "" +} + +func clusterOperatorConditionStatus(co *configv1.ClusterOperator, condType configv1.ClusterStatusConditionType) configv1.ConditionStatus { + for _, cond := range co.Status.Conditions { + if cond.Type == condType { + return cond.Status } } - return co, "" + return configv1.ConditionUnknown } func (optr *Operator) worker(ctx context.Context, queue workqueue.RateLimitingInterface, syncHandler func(context.Context, string) error) { From ce5026aea2118f8dbe8dc2e5bccb4829a476f421 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Wed, 27 Sep 2023 12:06:36 -0700 Subject: [PATCH 17/21] pkg/clusterconditions/cache: Avoid panic on all-fresh-cache evaluation When: * It has been longer than MinBetweenMatches since our last wrapped evaluation, * There is no alternative key stale enough (MinForCondition) to steal the evaluation, and * The currently requested key is not cached yet, e.g. because it was recently declared as a new risk to a cluster-version operator which had recently run all the other relevent risks from the current Cincinnati response. avoid crashing with: $ grep -B1 -A15 'too fresh' previous.log I0927 12:07:55.594222 1 cincinnati.go:114] Using a root CA pool with 0 root CA subjects to request updates from https://raw.githubusercontent.com/shellyyang1989/upgrade-cincy/master/cincy-conditional-edge-invalid-promql.json?arch=amd64&channel=stable-4.15&id=dc628f75-7778-457a-bb69-6a31a243c3a9&version=4.15.0-0.test-2023-09-27-091926-ci-ln-01zw7kk-latest I0927 12:07:55.726463 1 cache.go:118] {"type":"PromQL","promql":{"promql":"0 * group(cluster_version)"}} is the most stale cached cluster-condition match entry, but it is too fresh (last evaluated on 2023-09-27 11:37:25.876804482 +0000 UTC m=+175.082381015). However, we don't have a cached evaluation for {"type":"PromQL","promql":{"promql":"group(cluster_version_available_updates{channel=buggy})"}}, so attempt to evaluate that now. 
I0927 12:07:55.726602 1 cache.go:129] {"type":"PromQL","promql":{"promql":"0 * group(cluster_version)"}} is stealing this cluster-condition match call for {"type":"PromQL","promql":{"promql":"group(cluster_version_available_updates{channel=buggy})"}}, because its last evaluation completed 30m29.849594461s ago I0927 12:07:55.758573 1 cvo.go:703] Finished syncing available updates "openshift-cluster-version/version" (170.074319ms) E0927 12:07:55.758847 1 runtime.go:79] Observed a panic: "invalid memory address or nil pointer dereference" (runtime error: invalid memory address or nil pointer dereference) goroutine 194 [running]: k8s.io/apimachinery/pkg/util/runtime.logPanic({0x1c4df00?, 0x32abc60}) /go/src/github.com/openshift/cluster-version-operator/vendor/k8s.io/apimachinery/pkg/util/runtime/runtime.go:75 +0x99 k8s.io/apimachinery/pkg/util/runtime.HandleCrash({0x0, 0x0, 0xc001489d40?}) /go/src/github.com/openshift/cluster-version-operator/vendor/k8s.io/apimachinery/pkg/util/runtime/runtime.go:49 +0x75 panic({0x1c4df00, 0x32abc60}) /usr/lib/golang/src/runtime/panic.go:884 +0x213 github.com/openshift/cluster-version-operator/pkg/clusterconditions/promql.(*PromQL).Match(0xc0004860e0, {0x220ded8, 0xc00041e550}, 0x0) /go/src/github.com/openshift/cluster-version-operator/pkg/clusterconditions/promql/promql.go:134 +0x419 github.com/openshift/cluster-version-operator/pkg/clusterconditions/cache.(*Cache).Match(0xc0002d3ae0, {0x220ded8, 0xc00041e550}, 0xc0033948d0) /go/src/github.com/openshift/cluster-version-operator/pkg/clusterconditions/cache/cache.go:132 +0x982 github.com/openshift/cluster-version-operator/pkg/clusterconditions.(*conditionRegistry).Match(0xc000016760, {0x220ded8, 0xc00041e550}, {0xc0033948a0, 0x1, 0x0?}) --- pkg/clusterconditions/cache/cache.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/clusterconditions/cache/cache.go b/pkg/clusterconditions/cache/cache.go index ec772d2b24..e19a54d617 100644 --- a/pkg/clusterconditions/cache/cache.go +++ b/pkg/clusterconditions/cache/cache.go @@ -117,6 +117,8 @@ func (c *Cache) Match(ctx context.Context, condition *configv1.ClusterCondition) detail = fmt.Sprintf(" (last evaluated on %s)", thiefResult.When) } klog.V(2).Infof("%s is the most stale cached cluster-condition match entry, but it is too fresh%s. However, we don't have a cached evaluation for %s, so attempt to evaluate that now.", thiefKey, detail, key) + thiefKey = key + targetCondition = condition } // if we ended up stealing this Match call, log that, to make contention more clear From fa845474cdfb2c8cf828054a174d1ca6f022c27b Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Mon, 18 Sep 2023 22:43:28 +0200 Subject: [PATCH 18/21] availableupdates: do not reset lastTransitionTime on unchanged status (#964) * availableupdates: do not reset lastTransitionTime on unchanged status The code in `evaluateConditionalUpdates` correctly uses `SetStatusCondition` to set conditions, which only updates the `LastTransitionTime` field when `Status` differs between the original and updated state. Previously though, the original state always contained empty conditions, because conditional updates are always obtained from OSUS and the fresh structure was never updated with existing conditions from the in-cluster status. 
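As a standalone illustration of the library behavior in play (not code from
this patch), `SetStatusCondition` leaves `LastTransitionTime` alone as long as
`Status` is unchanged, which is why seeding the freshly fetched conditional
updates with the in-cluster conditions is enough:

    package main

    import (
        "fmt"
        "time"

        "k8s.io/apimachinery/pkg/api/meta"
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    )

    func main() {
        old := metav1.NewTime(time.Now().Add(-time.Hour))
        conditions := []metav1.Condition{{
            Type:               "Recommended",
            Status:             metav1.ConditionFalse,
            Reason:             "OldReason",
            Message:            "old message",
            LastTransitionTime: old,
        }}

        // Same Status, new Reason/Message: the transition time is kept.
        meta.SetStatusCondition(&conditions, metav1.Condition{
            Type:    "Recommended",
            Status:  metav1.ConditionFalse,
            Reason:  "NewReason",
            Message: "new message",
        })
        fmt.Println(conditions[0].LastTransitionTime.Equal(&old)) // true
    }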
* review: use existing mock condition instead of new code * review: use real queue instead of a mock --- pkg/clusterconditions/mock/mock.go | 4 +- pkg/cvo/availableupdates.go | 35 +++-- pkg/cvo/availableupdates_test.go | 226 +++++++++++++++++++++++++++++ 3 files changed, 251 insertions(+), 14 deletions(-) create mode 100644 pkg/cvo/availableupdates_test.go diff --git a/pkg/clusterconditions/mock/mock.go b/pkg/clusterconditions/mock/mock.go index bede44d735..47c4945c14 100644 --- a/pkg/clusterconditions/mock/mock.go +++ b/pkg/clusterconditions/mock/mock.go @@ -44,7 +44,7 @@ type Mock struct { } // Valid returns an error popped from ValidQueue. -func (m *Mock) Valid(ctx context.Context, condition *configv1.ClusterCondition) error { +func (m *Mock) Valid(_ context.Context, condition *configv1.ClusterCondition) error { m.Calls = append(m.Calls, Call{ When: time.Now(), Method: "Valid", @@ -61,7 +61,7 @@ func (m *Mock) Valid(ctx context.Context, condition *configv1.ClusterCondition) } // Match returns an error popped from MatchQueue. -func (m *Mock) Match(ctx context.Context, condition *configv1.ClusterCondition) (bool, error) { +func (m *Mock) Match(_ context.Context, condition *configv1.ClusterCondition) (bool, error) { m.Calls = append(m.Calls, Call{ When: time.Now(), Method: "Match", diff --git a/pkg/cvo/availableupdates.go b/pkg/cvo/availableupdates.go index cdbd01d422..3f88c577db 100644 --- a/pkg/cvo/availableupdates.go +++ b/pkg/cvo/availableupdates.go @@ -40,21 +40,20 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 arch := optr.getArchitecture() // updates are only checked at most once per minimumUpdateCheckInterval or if the generation changes - u := optr.getAvailableUpdates() - if u == nil { + optrAvailableUpdates := optr.getAvailableUpdates() + if optrAvailableUpdates == nil { klog.V(2).Info("First attempt to retrieve available updates") - } else if !u.RecentlyChanged(optr.minimumUpdateCheckInterval) { - klog.V(2).Infof("Retrieving available updates again, because more than %s has elapsed since %s", optr.minimumUpdateCheckInterval, u.LastAttempt) - } else if channel != u.Channel { - klog.V(2).Infof("Retrieving available updates again, because the channel has changed from %q to %q", u.Channel, channel) - } else if arch != u.Architecture { - klog.V(2).Infof("Retrieving available updates again, because the architecture has changed from %q to %q", - u.Architecture, arch) - } else if upstream == u.Upstream || (upstream == optr.defaultUpstreamServer && u.Upstream == "") { - klog.V(2).Infof("Available updates were recently retrieved, with less than %s elapsed since %s, will try later.", optr.minimumUpdateCheckInterval, u.LastAttempt) + } else if !optrAvailableUpdates.RecentlyChanged(optr.minimumUpdateCheckInterval) { + klog.V(2).Infof("Retrieving available updates again, because more than %s has elapsed since %s", optr.minimumUpdateCheckInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339)) + } else if channel != optrAvailableUpdates.Channel { + klog.V(2).Infof("Retrieving available updates again, because the channel has changed from %q to %q", optrAvailableUpdates.Channel, channel) + } else if arch != optrAvailableUpdates.Architecture { + klog.V(2).Infof("Retrieving available updates again, because the architecture has changed from %q to %q", optrAvailableUpdates.Architecture, arch) + } else if upstream == optrAvailableUpdates.Upstream || (upstream == optr.defaultUpstreamServer && optrAvailableUpdates.Upstream == "") { + 
klog.V(2).Infof("Available updates were recently retrieved, with less than %s elapsed since %s, will try later.", optr.minimumUpdateCheckInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339)) return nil } else { - klog.V(2).Infof("Retrieving available updates again, because the upstream has changed from %q to %q", u.Upstream, config.Spec.Upstream) + klog.V(2).Infof("Retrieving available updates again, because the upstream has changed from %q to %q", optrAvailableUpdates.Upstream, config.Spec.Upstream) } transport, err := optr.getTransport() @@ -64,6 +63,18 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 current, updates, conditionalUpdates, condition := calculateAvailableUpdatesStatus(ctx, string(config.Spec.ClusterID), transport, upstream, arch, channel, optr.release.Version, optr.conditionRegistry) + // Populate conditions on conditional updates from operator state + if optrAvailableUpdates != nil { + for i := range optrAvailableUpdates.ConditionalUpdates { + for j := range conditionalUpdates { + if optrAvailableUpdates.ConditionalUpdates[i].Release.Image == conditionalUpdates[j].Release.Image { + conditionalUpdates[j].Conditions = optrAvailableUpdates.ConditionalUpdates[i].Conditions + break + } + } + } + } + if usedDefaultUpstream { upstream = "" } diff --git a/pkg/cvo/availableupdates_test.go b/pkg/cvo/availableupdates_test.go new file mode 100644 index 0000000000..117ec21616 --- /dev/null +++ b/pkg/cvo/availableupdates_test.go @@ -0,0 +1,226 @@ +package cvo + +import ( + "context" + "fmt" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/util/workqueue" + + configv1 "github.com/openshift/api/config/v1" + "github.com/openshift/cluster-version-operator/pkg/clusterconditions" + "github.com/openshift/cluster-version-operator/pkg/clusterconditions/always" + "github.com/openshift/cluster-version-operator/pkg/clusterconditions/mock" +) + +// notFoundProxyLister is a stub for ProxyLister +type notFoundProxyLister struct{} + +func (n notFoundProxyLister) List(labels.Selector) ([]*configv1.Proxy, error) { + return nil, nil +} + +func (n notFoundProxyLister) Get(name string) (*configv1.Proxy, error) { + return nil, errors.NewNotFound(schema.GroupResource{Group: configv1.GroupName, Resource: "proxy"}, name) +} + +type notFoundConfigMapLister struct{} + +func (n notFoundConfigMapLister) List(labels.Selector) ([]*corev1.ConfigMap, error) { + return nil, nil +} + +func (n notFoundConfigMapLister) Get(name string) (*corev1.ConfigMap, error) { + return nil, errors.NewNotFound(schema.GroupResource{Group: "", Resource: "configmap"}, name) +} + +// osusWithSingleConditionalEdge helper returns: +// 1. mock osus server that serves a simple conditional path between two versions. +// 2. mock condition that always evaluates to match +// 3. expected []ConditionalUpdate data after evaluation of the data served by mock osus server +// (assuming the mock condition (2) was used) +// 4. 
current version of the cluster that would issue the request to the mock osus server +func osusWithSingleConditionalEdge() (*httptest.Server, clusterconditions.Condition, []configv1.ConditionalUpdate, string) { + from := "4.5.5" + to := "4.5.6" + osus := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + fmt.Fprintf(w, `{ + "nodes": [{"version": "%s", "payload": "payload/%s"}, {"version": "%s", "payload": "payload/%s"}], + "conditionalEdges": [ + { + "edges": [{"from": "%s", "to": "%s"}], + "risks": [ + { + "url": "https://example.com/%s", + "name": "FourFiveSix", + "message": "Four Five Five is just fine", + "matchingRules": [{"type": "PromQL", "promql": { "promql": "this is a query"}}] + } + ] + } + ] +} +`, from, from, to, to, from, to, to) + })) + + updates := []configv1.ConditionalUpdate{ + { + Release: configv1.Release{Version: to, Image: "payload/" + to}, + Risks: []configv1.ConditionalUpdateRisk{ + { + URL: "https://example.com/" + to, + Name: "FourFiveSix", + Message: "Four Five Five is just fine", + MatchingRules: []configv1.ClusterCondition{ + { + Type: "PromQL", + PromQL: &configv1.PromQLClusterCondition{PromQL: "this is a query"}, + }, + }, + }, + }, + Conditions: []metav1.Condition{ + { + Type: "Recommended", + Status: metav1.ConditionFalse, + Reason: "FourFiveSix", + Message: "Four Five Five is just fine https://example.com/" + to, + }, + }, + }, + } + mockPromql := &mock.Mock{ + ValidQueue: []error{nil}, + MatchQueue: []mock.MatchResult{{Match: true, Error: nil}}, + } + + return osus, mockPromql, updates, from +} + +func newOperator(url, version string, promqlMock clusterconditions.Condition) (*availableUpdates, *Operator) { + currentRelease := configv1.Release{Version: version, Image: "payload/" + version} + registry := clusterconditions.NewConditionRegistry() + registry.Register("Always", &always.Always{}) + registry.Register("PromQL", promqlMock) + operator := &Operator{ + defaultUpstreamServer: url, + architecture: "amd64", + proxyLister: notFoundProxyLister{}, + cmConfigManagedLister: notFoundConfigMapLister{}, + conditionRegistry: registry, + queue: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()), + release: currentRelease, + } + availableUpdates := &availableUpdates{ + Architecture: "amd64", + Current: configv1.Release{Version: version, Image: "payload/" + version}, + } + return availableUpdates, operator +} + +var cvFixture = &configv1.ClusterVersion{ + Spec: configv1.ClusterVersionSpec{ + ClusterID: "897f0a22-33ca-4106-a2c4-29b75250255a", + Channel: "channel", + }, +} + +var availableUpdatesCmpOpts = []cmp.Option{ + cmpopts.IgnoreTypes(time.Time{}), + cmpopts.IgnoreInterfaces(struct { + clusterconditions.ConditionRegistry + }{}), +} + +func TestSyncAvailableUpdates(t *testing.T) { + fakeOsus, mockPromql, expectedConditionalUpdates, version := osusWithSingleConditionalEdge() + defer fakeOsus.Close() + expectedAvailableUpdates, optr := newOperator(fakeOsus.URL, version, mockPromql) + expectedAvailableUpdates.ConditionalUpdates = expectedConditionalUpdates + expectedAvailableUpdates.Channel = cvFixture.Spec.Channel + expectedAvailableUpdates.Condition = configv1.ClusterOperatorStatusCondition{ + Type: configv1.RetrievedUpdates, + Status: configv1.ConditionTrue, + } + + err := optr.syncAvailableUpdates(context.Background(), cvFixture) + + if err != nil { + t.Fatalf("syncAvailableUpdates() unexpected error: %v", err) + } + if diff := cmp.Diff(expectedAvailableUpdates, optr.availableUpdates, 
availableUpdatesCmpOpts...); diff != "" { + t.Fatalf("available updates differ from expected:\n%s", diff) + } +} + +func TestSyncAvailableUpdates_ConditionalUpdateRecommendedConditions(t *testing.T) { + testCases := []struct { + name string + modifyOriginalState func(condition *metav1.Condition) + expectTimeChange bool + }{ + { + name: "lastTransitionTime is not updated when nothing changes", + modifyOriginalState: func(condition *metav1.Condition) {}, + }, + { + name: "lastTransitionTime is not updated when changed but status is identical", + modifyOriginalState: func(condition *metav1.Condition) { + condition.Reason = "OldReason" + condition.Message = "This message should be changed to something else" + }, + }, + { + name: "lastTransitionTime is updated when status changes", + modifyOriginalState: func(condition *metav1.Condition) { + condition.Status = metav1.ConditionUnknown + }, + expectTimeChange: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + fakeOsus, mockPromql, conditionalUpdates, version := osusWithSingleConditionalEdge() + defer fakeOsus.Close() + availableUpdates, optr := newOperator(fakeOsus.URL, version, mockPromql) + optr.availableUpdates = availableUpdates + optr.availableUpdates.ConditionalUpdates = conditionalUpdates + expectedConditions := []metav1.Condition{{}} + conditionalUpdates[0].Conditions[0].DeepCopyInto(&expectedConditions[0]) + tc.modifyOriginalState(&optr.availableUpdates.ConditionalUpdates[0].Conditions[0]) + + err := optr.syncAvailableUpdates(context.Background(), cvFixture) + + if err != nil { + t.Fatalf("syncAvailableUpdates() unexpected error: %v", err) + } + if optr.availableUpdates == nil || len(optr.availableUpdates.ConditionalUpdates) == 0 { + t.Fatalf("syncAvailableUpdates() did not properly set available updates") + } + if diff := cmp.Diff(expectedConditions, optr.availableUpdates.ConditionalUpdates[0].Conditions, cmpopts.IgnoreTypes(time.Time{})); diff != "" { + t.Errorf("conditions on conditional updates differ from expected:\n%s", diff) + } + timeBefore := expectedConditions[0].LastTransitionTime + timeAfter := optr.availableUpdates.ConditionalUpdates[0].Conditions[0].LastTransitionTime + + if tc.expectTimeChange && timeBefore == timeAfter { + t.Errorf("lastTransitionTime was not updated as expected: before=%s after=%s", timeBefore, timeAfter) + } + if !tc.expectTimeChange && timeBefore != timeAfter { + t.Errorf("lastTransitionTime was updated but was not expected to: before=%s after=%s", timeBefore, timeAfter) + } + }) + } +} From 202b5c63ffcd420775b42be3cc45ce372fe099a0 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Mon, 11 Sep 2023 14:08:45 -0700 Subject: [PATCH 19/21] pkg/cvo/availableupdates: Return a copy in getAvailableUpdates The function had returned the original pointer since it landed in db150e6db4 (cvo: Perform status updates in a single thread, 2018-11-03, #45). But locking the operator structure to return a pointer reference is a bit risky, because after the lock is released you're still holding a pointer into that data, but lack easy access to the lock to guard against simultaneous access. For example, you could have setAvailableUpdates updating the structure, while simultaneously operatorMetrics.Collect, Operator.syncStatus, or Operator.mergeReleaseMetadata is looking at their pointer reference to the old data. 
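A condensed sketch of the copy-under-lock pattern adopted here (hypothetical
types, not the operator's actual fields):

    package main

    import (
        "fmt"
        "sync"
    )

    type snapshot struct {
        Updates []string
    }

    type holder struct {
        mu   sync.Mutex
        data *snapshot
    }

    // get copies the shared state while the lock is held, so callers never
    // keep a pointer into data that a concurrent writer may later replace
    // or mutate.
    func (h *holder) get() *snapshot {
        h.mu.Lock()
        defer h.mu.Unlock()
        if h.data == nil {
            return nil
        }
        c := &snapshot{}
        if h.data.Updates != nil {
            c.Updates = append([]string(nil), h.data.Updates...)
        }
        return c
    }

    func main() {
        h := &holder{data: &snapshot{Updates: []string{"4.5.6"}}}
        snap := h.get()
        fmt.Println(snap.Updates) // safe to read without holding h.mu
    }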
There wasn't actually much exposure, because writes all happened to flow through setAvailableUpdates, and setAvailableUpdates's only changes were: * Bumping the u.LastSyncOrConfigChange Time. * Replacing the availableUpdates pointer with a new pointer. and neither of those should significantly disrupt any of the consumers. But switching to a copy doesn't cost much resource wise, and it protects us from a number of possible ways that this could break in the future if setAvailableUpdates does less full-pointer-replacement or one of the consumers starts to care about LastSyncOrConfigChange reliably lining up with the rest of the availableUpdates content. It does mean we need to update the copy logic as we add new properties to the structure, but we'd need to do that even if we used deepcopy-gen or similar to automate the copy generation. --- pkg/cvo/availableupdates.go | 55 ++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 13 deletions(-) diff --git a/pkg/cvo/availableupdates.go b/pkg/cvo/availableupdates.go index 3f88c577db..d33ca2c5f8 100644 --- a/pkg/cvo/availableupdates.go +++ b/pkg/cvo/availableupdates.go @@ -43,6 +43,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 optrAvailableUpdates := optr.getAvailableUpdates() if optrAvailableUpdates == nil { klog.V(2).Info("First attempt to retrieve available updates") + optrAvailableUpdates = &availableUpdates{} } else if !optrAvailableUpdates.RecentlyChanged(optr.minimumUpdateCheckInterval) { klog.V(2).Infof("Retrieving available updates again, because more than %s has elapsed since %s", optr.minimumUpdateCheckInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339)) } else if channel != optrAvailableUpdates.Channel { @@ -79,19 +80,17 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 upstream = "" } - au := &availableUpdates{ - Upstream: upstream, - Channel: config.Spec.Channel, - Architecture: arch, - Current: current, - Updates: updates, - ConditionalUpdates: conditionalUpdates, - ConditionRegistry: optr.conditionRegistry, - Condition: condition, - } + optrAvailableUpdates.Upstream = upstream + optrAvailableUpdates.Channel = channel + optrAvailableUpdates.Architecture = arch + optrAvailableUpdates.Current = current + optrAvailableUpdates.Updates = updates + optrAvailableUpdates.ConditionalUpdates = conditionalUpdates + optrAvailableUpdates.ConditionRegistry = optr.conditionRegistry + optrAvailableUpdates.Condition = condition - au.evaluateConditionalUpdates(ctx) - optr.setAvailableUpdates(au) + optrAvailableUpdates.evaluateConditionalUpdates(ctx) + optr.setAvailableUpdates(optrAvailableUpdates) // requeue optr.queue.Add(optr.queueKey()) @@ -194,7 +193,37 @@ func (optr *Operator) setAvailableUpdates(u *availableUpdates) { func (optr *Operator) getAvailableUpdates() *availableUpdates { optr.statusLock.Lock() defer optr.statusLock.Unlock() - return optr.availableUpdates + + if optr.availableUpdates == nil { + return nil + } + + u := &availableUpdates{ + Upstream: optr.availableUpdates.Upstream, + Channel: optr.availableUpdates.Channel, + Architecture: optr.availableUpdates.Architecture, + LastAttempt: optr.availableUpdates.LastAttempt, + LastSyncOrConfigChange: optr.availableUpdates.LastSyncOrConfigChange, + Current: *optr.availableUpdates.Current.DeepCopy(), + ConditionRegistry: optr.availableUpdates.ConditionRegistry, // intentionally not a copy, to preserve cache state + Condition: optr.availableUpdates.Condition, + } + + if 
optr.availableUpdates.Updates != nil { + u.Updates = make([]configv1.Release, 0, len(optr.availableUpdates.Updates)) + for _, update := range optr.availableUpdates.Updates { + u.Updates = append(u.Updates, *update.DeepCopy()) + } + } + + if optr.availableUpdates.ConditionalUpdates != nil { + u.ConditionalUpdates = make([]configv1.ConditionalUpdate, 0, len(optr.availableUpdates.ConditionalUpdates)) + for _, conditionalUpdate := range optr.availableUpdates.ConditionalUpdates { + u.ConditionalUpdates = append(u.ConditionalUpdates, *conditionalUpdate.DeepCopy()) + } + } + + return u } // getArchitecture returns the currently determined cluster architecture. From 260404b09bc3b761dd47238823afea6acf546d4f Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Mon, 18 Sep 2023 15:58:37 -0700 Subject: [PATCH 20/21] pkg/cvo/availableupdates: Requeue risk evaluation on failure Instead of waiting for the next round of evaluation, which might take minutes. For example, in 4.14.0-rc.1 testing [1]: $ curl -s https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/origin-ci-test/logs/periodic-ci-openshift-release-master-nightly-4.14-e2e-aws-ovn-serial/1702743868887273472/artifacts/e2e-aws-ovn-serial/gather-extra/artifacts/pods/openshift-cluster-version_cluster-version-operator-78644f4679-q8sdm_cluster-version-operator.log | grep availableupdate I0915 18:21:24.184272 1 availableupdates.go:50] First attempt to retrieve available updates I0915 18:21:24.845512 1 availableupdates.go:58] Available updates were recently retrieved, with less than 2m28.489200644s elapsed since 2023-09-15T18:21:24Z, will try later. I0915 18:21:39.836566 1 availableupdates.go:58] Available updates were recently retrieved, with less than 2m28.489200644s elapsed since 2023-09-15T18:21:24Z, will try later. I0915 18:21:39.843398 1 availableupdates.go:58] Available updates were recently retrieved, with less than 2m28.489200644s elapsed since 2023-09-15T18:21:24Z, will try later. I0915 18:21:54.835464 1 availableupdates.go:58] Available updates were recently retrieved, with less than 2m28.489200644s elapsed since 2023-09-15T18:21:24Z, will try later. I0915 18:23:16.769850 1 availableupdates.go:58] Available updates were recently retrieved, with less than 2m28.489200644s elapsed since 2023-09-15T18:21:24Z, will try later. I0915 18:23:16.784421 1 availableupdates.go:58] Available updates were recently retrieved, with less than 2m28.489200644s elapsed since 2023-09-15T18:21:24Z, will try later. I0915 18:23:39.842269 1 availableupdates.go:58] Available updates were recently retrieved, with less than 2m28.489200644s elapsed since 2023-09-15T18:21:24Z, will try later. I0915 18:23:39.862590 1 availableupdates.go:58] Available updates were recently retrieved, with less than 2m28.489200644s elapsed since 2023-09-15T18:21:24Z, will try later. I0915 18:24:09.837669 1 availableupdates.go:52] Retrieving available updates again, because more than 2m28.489200644s has elapsed since 2023-09-15T18:21:24Z I0915 18:24:24.843569 1 availableupdates.go:58] Available updates were recently retrieved, with less than 2m28.489200644s elapsed since 2023-09-15T18:24:09Z, will try later. I0915 18:25:24.839869 1 availableupdates.go:58] Available updates were recently retrieved, with less than 2m28.489200644s elapsed since 2023-09-15T18:24:09Z, will try later. ... 
I0915 20:26:07.109093 1 availableupdates.go:52] Retrieving available updates again, because more than 2m28.489200644s has elapsed since 2023-09-15T20:22:23Z
I0915 20:29:50.769739 1 availableupdates.go:52] Retrieving available updates again, because more than 2m28.489200644s has elapsed since 2023-09-15T20:26:07Z
I0915 20:33:34.432215 1 availableupdates.go:52] Retrieving available updates again, because more than 2m28.489200644s has elapsed since 2023-09-15T20:29:50Z
I0915 20:37:18.093261 1 availableupdates.go:52] Retrieving available updates again, because more than 2m28.489200644s has elapsed since 2023-09-15T20:33:34Z

I'm not entirely clear on what the triggers were there, with 3m44s between those final entries. Operator.Run sets up:

wait.UntilWithContext(runContext, func(runContext context.Context) {
	optr.worker(runContext, optr.availableUpdatesQueue, optr.availableUpdatesSync)
}, time.Second)

and [2] docs UntilWithContext:

UntilWithContext loops until context is done, running f every period.

UntilWithContext is syntactic sugar on top of JitterUntilWithContext with zero jitter factor and with sliding = true (which means the timer for period starts after the f completes).

So that should be waking up, draining the queue, sleeping a second, waking back up, draining the queue again, and on forever. Perhaps we are just backing off to the slowest DefaultControllerRateLimiter period [3], but I expect processNextWorkItem's call to handleErr is calling Forget on the queue, because I don't see any of its error-branch logging:

$ curl -s https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/origin-ci-test/logs/periodic-ci-openshift-release-master-nightly-4.14-e2e-aws-ovn-serial/1702743868887273472/artifacts/e2e-aws-ovn-serial/gather-extra/artifacts/pods/openshift-cluster-version_cluster-version-operator-78644f4679-q8sdm_cluster-version-operator.log | grep 'Error handling\|out of the queue'
...no hits...

That suggests nothing is slowing down our queue processing from once-per-second (plus evaluation time). But what's feeding the queue items to process? The only Add calls seem to be in clusterVersionEventHandler, but checking audit logs:

$ curl -s https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/origin-ci-test/logs/periodic-ci-openshift-release-master-nightly-4.14-e2e-aws-ovn-serial/1702743868887273472/artifacts/e2e-aws-ovn-serial/gather-audit-logs/artifacts/audit-logs.tar | tar -xz --strip-components=2
$ zgrep -h clusterversion kube-apiserver/*audit*.log.gz | jq -r 'select(.verb != "get" and .verb != "list" and .verb != "watch") | .stageTimestamp + " " + (.responseStatus.code | tostring) + " " + .verb + " " + .objectRef.subresource + " " + .user.username' | sort
...
2023-09-15T18:26:24.841812Z 200 update status system:serviceaccount:openshift-cluster-version:default
2023-09-15T18:26:24.858507Z 200 update status system:serviceaccount:openshift-cluster-version:default
2023-09-15T18:29:39.835307Z 200 update status system:serviceaccount:openshift-cluster-version:default
2023-09-15T18:37:39.836698Z 200 update status system:serviceaccount:openshift-cluster-version:default

which are all hours before these 20:26 and similar update retrievals. I suspect this is due to resyncPeriod(o.ResyncInterval) being passed to NewFilteredSharedInformerFactory when generating the ClusterVersion informer, putting a lower bound on the UpdateFunc event-handler frequency.
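For readers who have not looked at that wiring, the following is a minimal, self-contained sketch (not the CVO's actual code) of the wait.UntilWithContext plus rate-limited workqueue pattern discussed above; the "cluster-version" key, the AddAfter requeue, and the three-second shutdown are illustrative only.

package main

import (
	"context"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/util/workqueue"
)

func main() {
	queue := workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())

	ctx, cancel := context.WithCancel(context.Background())
	queue.Add("cluster-version")

	// Let the example run for a few seconds, then stop.
	go func() {
		time.Sleep(3 * time.Second)
		queue.ShutDown()
		cancel()
	}()

	// UntilWithContext runs the worker, sleeps for the period after it
	// returns (sliding = true), and repeats until the context is done.
	wait.UntilWithContext(ctx, func(ctx context.Context) {
		for {
			key, quit := queue.Get()
			if quit {
				return
			}
			fmt.Printf("%s syncing %v\n", time.Now().Format(time.RFC3339), key)
			queue.Forget(key) // drop any rate-limiter backoff, as a successful sync would
			queue.Done(key)
			queue.AddAfter(key, time.Second) // requeue roughly a second later
		}
	}, time.Second)
}

With those pieces each sync wakes up about once per second, which is why the slow retrievals above point at the informer resync period rather than at the queue itself.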
My goal is to set the stage for faster cache-warming after receiving a batch of new PromQL update risks, as described in 530a5092a6 (pkg/cvo/availableupdates: Prioritize conditional risks for largest target version, 2023-03-06, #909). I still have not adjusted the caching logic, so at the moment it only gives us faster trips around the "that PromQL is still throttled" loop.

The AddAfter avoids hot-looping on:

1. Can I evaluate the risks?
2. No? Requeue and return to 1 right now.

and instead gives us:

1. Can I evaluate the risks?
2. No? Requeue and return to 1 around a second from now.

The new addUpdate avoids injecting the same Recommended=True target into availableUpdates multiple times while trying to evaluate another conditional update, now that we have the !needFreshFetch case, where we recycle the previous structure's data without a fresh Cincinnati fetch to clear earlier additions. Without the addUpdate pivot, we get [5]:

status:
  availableUpdates:
  - image: registry.ci.openshift.org/ocp/release@sha256:e385a786f122c6c0e8848ecb9901f510676438f17af8a5c4c206807a9bc0bf28
    version: 4.15.0-0.nightly-2023-10-19-222222
  - image: registry.ci.openshift.org/ocp/release@sha256:e385a786f122c6c0e8848ecb9901f510676438f17af8a5c4c206807a9bc0bf28
    version: 4.15.0-0.nightly-2023-10-19-222222
  - image: registry.ci.openshift.org/ocp/release@sha256:e385a786f122c6c0e8848ecb9901f510676438f17af8a5c4c206807a9bc0bf28
    version: 4.15.0-0.nightly-2023-10-19-222222
  ...
  conditionalUpdates:
  - conditions:
    - lastTransitionTime: "2023-09-21T09:29:30Z"
      message: The update is recommended, because none of the conditional update risks apply to this cluster.
      reason: AsExpected
      status: "True"
      type: Recommended
    release:
      image: registry.ci.openshift.org/ocp/release@sha256:e385a786f122c6c0e8848ecb9901f510676438f17af8a5c4c206807a9bc0bf28
      version: 4.15.0-0.nightly-2023-10-19-222222
    risks:
    - matchingRules:
      - promql:
          promql: |-
            cluster_infrastructure_provider{type=~"nonexist"}
            or 0 * cluster_infrastructure_provider
        type: PromQL
      message: Clusters on nonexist provider, this imaginary bug can happen.
      name: SomeInfrastructureThing
      url: https://bug.example.com/c
  ...
  - conditions:
    - lastTransitionTime: "2023-09-21T09:29:31Z"
      message: |-
        On clusters on default invoker user, this imaginary bug can happen. https://bug.example.com/a

        Could not evaluate exposure to update risk SomeChannelThing (evaluation is throttled until 09:29:32Z)
          SomeChannelThing description: On clusters with the channel set to 'buggy', this imaginary bug can happen.
          SomeChannelThing URL: https://bug.example.com/b
      reason: MultipleReasons
      status: "False"
      type: Recommended
    release:
      image: registry.ci.openshift.org/ocp/release@sha256:66c753e8b75d172f2a3f7ba13363383a76ecbc7ecdc00f3a423bef4ea8560405
      version: 4.15.0-0.nightly-2023-10-17-000000
    risks:
    - matchingRules:
      - promql:
          promql: cluster_installer
        type: PromQL
      message: On clusters on default invoker user, this imaginary bug can happen.
      name: SomeInvokerThing
      url: https://bug.example.com/a
    - matchingRules:
      - promql:
          promql: |-
            group(cluster_version_available_updates{channel="buggy"})
            or 0 * group(cluster_version_available_updates{channel!="buggy"})
        type: PromQL
      message: On clusters with the channel set to 'buggy', this imaginary bug can happen.
      name: SomeChannelThing
      url: https://bug.example.com/b

lasting until the next Cincinnati fetch cleared out the availableUpdates redundancy.
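The "does anything still need evaluation?" test that drives both the no-fresh-fetch path and the one-second requeue in the diff below reduces to scanning for a missing or indefinite Recommended condition. Here is a standalone sketch of that check, using bare metav1.Condition slices and made-up version strings rather than the operator's ConditionalUpdate type:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// needsEvaluation reports whether any target release still lacks a definite
// Recommended=True or Recommended=False verdict, along with one such version.
func needsEvaluation(targets map[string][]metav1.Condition) (string, bool) {
	for version, conditions := range targets {
		recommended := meta.FindStatusCondition(conditions, "Recommended")
		if recommended == nil || (recommended.Status != metav1.ConditionTrue && recommended.Status != metav1.ConditionFalse) {
			return version, true
		}
	}
	return "", false
}

func main() {
	targets := map[string][]metav1.Condition{
		"4.15.0-0.nightly-2023-10-19-222222": {{Type: "Recommended", Status: metav1.ConditionTrue}},
		"4.15.0-0.nightly-2023-10-17-000000": {{Type: "Recommended", Status: metav1.ConditionUnknown}},
	}
	if version, ok := needsEvaluation(targets); ok {
		fmt.Printf("requeue: %s has no definite Recommended verdict yet\n", version)
	}
}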
[1]: https://prow.ci.openshift.org/view/gs/origin-ci-test/logs/periodic-ci-openshift-release-master-nightly-4.14-e2e-aws-ovn-serial/1702743868887273472 [2]: https://pkg.go.dev/k8s.io/apimachinery/pkg/util/wait#UntilWithContext [3]: https://pkg.go.dev/k8s.io/client-go/util/workqueue#DefaultControllerRateLimiter [4]: https://github.com/kubernetes/client-go/blob/v0.28.2/util/workqueue/default_rate_limiters.go#L39 [5]: https://github.com/openshift/cluster-version-operator/pull/939#issuecomment-1729229326 --- pkg/cvo/availableupdates.go | 91 ++++++++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 26 deletions(-) diff --git a/pkg/cvo/availableupdates.go b/pkg/cvo/availableupdates.go index d33ca2c5f8..36ff71b7d8 100644 --- a/pkg/cvo/availableupdates.go +++ b/pkg/cvo/availableupdates.go @@ -41,6 +41,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 // updates are only checked at most once per minimumUpdateCheckInterval or if the generation changes optrAvailableUpdates := optr.getAvailableUpdates() + needFreshFetch := true if optrAvailableUpdates == nil { klog.V(2).Info("First attempt to retrieve available updates") optrAvailableUpdates = &availableUpdates{} @@ -51,21 +52,35 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 } else if arch != optrAvailableUpdates.Architecture { klog.V(2).Infof("Retrieving available updates again, because the architecture has changed from %q to %q", optrAvailableUpdates.Architecture, arch) } else if upstream == optrAvailableUpdates.Upstream || (upstream == optr.defaultUpstreamServer && optrAvailableUpdates.Upstream == "") { - klog.V(2).Infof("Available updates were recently retrieved, with less than %s elapsed since %s, will try later.", optr.minimumUpdateCheckInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339)) - return nil + needsConditionalUpdateEval := false + for _, conditionalUpdate := range optrAvailableUpdates.ConditionalUpdates { + if recommended := meta.FindStatusCondition(conditionalUpdate.Conditions, "Recommended"); recommended == nil { + needsConditionalUpdateEval = true + break + } else if recommended.Status != metav1.ConditionTrue && recommended.Status != metav1.ConditionFalse { + needsConditionalUpdateEval = true + break + } + } + if !needsConditionalUpdateEval { + klog.V(2).Infof("Available updates were recently retrieved, with less than %s elapsed since %s, will try later.", optr.minimumUpdateCheckInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339)) + return nil + } + needFreshFetch = false } else { klog.V(2).Infof("Retrieving available updates again, because the upstream has changed from %q to %q", optrAvailableUpdates.Upstream, config.Spec.Upstream) } - transport, err := optr.getTransport() - if err != nil { - return err - } + if needFreshFetch { + transport, err := optr.getTransport() + if err != nil { + return err + } - current, updates, conditionalUpdates, condition := calculateAvailableUpdatesStatus(ctx, string(config.Spec.ClusterID), transport, upstream, arch, channel, optr.release.Version, optr.conditionRegistry) + current, updates, conditionalUpdates, condition := calculateAvailableUpdatesStatus(ctx, string(config.Spec.ClusterID), + transport, upstream, arch, channel, optr.release.Version, optr.conditionRegistry) - // Populate conditions on conditional updates from operator state - if optrAvailableUpdates != nil { + // Populate conditions on conditional updates from operator state for i := range 
optrAvailableUpdates.ConditionalUpdates { for j := range conditionalUpdates { if optrAvailableUpdates.ConditionalUpdates[i].Release.Image == conditionalUpdates[j].Release.Image { @@ -74,26 +89,40 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 } } } - } - if usedDefaultUpstream { - upstream = "" - } + if usedDefaultUpstream { + upstream = "" + } - optrAvailableUpdates.Upstream = upstream - optrAvailableUpdates.Channel = channel - optrAvailableUpdates.Architecture = arch - optrAvailableUpdates.Current = current - optrAvailableUpdates.Updates = updates - optrAvailableUpdates.ConditionalUpdates = conditionalUpdates - optrAvailableUpdates.ConditionRegistry = optr.conditionRegistry - optrAvailableUpdates.Condition = condition + optrAvailableUpdates.Upstream = upstream + optrAvailableUpdates.Channel = channel + optrAvailableUpdates.Architecture = arch + optrAvailableUpdates.Current = current + optrAvailableUpdates.Updates = updates + optrAvailableUpdates.ConditionalUpdates = conditionalUpdates + optrAvailableUpdates.ConditionRegistry = optr.conditionRegistry + optrAvailableUpdates.Condition = condition + } optrAvailableUpdates.evaluateConditionalUpdates(ctx) + + queueKey := optr.queueKey() + for _, conditionalUpdate := range optrAvailableUpdates.ConditionalUpdates { + if recommended := meta.FindStatusCondition(conditionalUpdate.Conditions, "Recommended"); recommended == nil { + klog.Warningf("Requeue available-update evaluation, because %q lacks a Recommended condition", conditionalUpdate.Release.Version) + optr.availableUpdatesQueue.AddAfter(queueKey, time.Second) + break + } else if recommended.Status != metav1.ConditionTrue && recommended.Status != metav1.ConditionFalse { + klog.V(2).Infof("Requeue available-update evaluation, because %q is %s=%s: %s: %s", conditionalUpdate.Release.Version, recommended.Type, recommended.Status, recommended.Reason, recommended.Message) + optr.availableUpdatesQueue.AddAfter(queueKey, time.Second) + break + } + } + optr.setAvailableUpdates(optrAvailableUpdates) - // requeue - optr.queue.Add(optr.queueKey()) + // queue optr.sync() to update ClusterVersion status + optr.queue.Add(queueKey) return nil } @@ -334,7 +363,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { for i, conditionalUpdate := range u.ConditionalUpdates { if errorCondition := evaluateConditionalUpdate(ctx, &conditionalUpdate, u.ConditionRegistry); errorCondition != nil { meta.SetStatusCondition(&conditionalUpdate.Conditions, *errorCondition) - u.removeUpdate(ctx, conditionalUpdate.Release.Image) + u.removeUpdate(conditionalUpdate.Release.Image) } else { meta.SetStatusCondition(&conditionalUpdate.Conditions, metav1.Condition{ Type: "Recommended", @@ -343,13 +372,23 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { Reason: "AsExpected", Message: "The update is recommended, because none of the conditional update risks apply to this cluster.", }) - u.Updates = append(u.Updates, conditionalUpdate.Release) + u.addUpdate(conditionalUpdate.Release) } u.ConditionalUpdates[i].Conditions = conditionalUpdate.Conditions } } -func (u *availableUpdates) removeUpdate(ctx context.Context, image string) { +func (u *availableUpdates) addUpdate(release configv1.Release) { + for _, update := range u.Updates { + if update.Image == release.Image { + return + } + } + + u.Updates = append(u.Updates, release) +} + +func (u *availableUpdates) removeUpdate(image string) { for i, update := range u.Updates { if update.Image == 
image { u.Updates = append(u.Updates[:i], u.Updates[i+1:]...)

From db85bf15492ff26d8f6b4d4c71ec3752036e20b4 Mon Sep 17 00:00:00 2001
From: "W. Trevor King"
Date: Mon, 18 Sep 2023 22:35:23 -0700
Subject: [PATCH 21/21] pkg/clusterconditions/promql: Reduce MinBetweenMatches to 1s

530a5092a6 (pkg/cvo/availableupdates: Prioritize conditional risks for largest target version, 2023-03-06, #909) prioritized the order in which risks were evaluated. But we were still waiting 10 minutes between different PromQL evaluations while evaluating conditional update risks. The original 10m requirement is from the enhancement [1], and was implemented in ca186eda34 (pkg/clusterconditions/cache: Add a cache wrapper for client-side throttling, 2021-11-10, #663). But discussing with Lala, Scott, and Ben, we feel like addressing the demonstrated user-experience need for low-latency risk evaluation [2] is worth reducing the throttling to 1s per expression evaluation.

We still have MinForCondition set to an hour, so with this commit, a cluster-version operator evaluating three risks will move from a timeline like:

1. 0s, hear about risks that depend on PromQL A, B, and C. Evaluate A for the first time.
2. 10m, evaluate B for the first time (MinBetweenMatches after 1).
3. 20m, evaluate C for the first time (MinBetweenMatches after 2).
4. 1h, evaluate A again (MinForCondition after 1, also well past MinBetweenMatches after 3).
5. 1h10m, evaluate B again (MinForCondition after 2 and MinBetweenMatches after 4).
6. 1h20m, evaluate C again (MinForCondition after 3 and MinBetweenMatches after 5).
7. 2h, evaluate A again (MinForCondition after 4, also well past MinBetweenMatches after 6).
8. 2h10m, evaluate B again (MinForCondition after 5 and MinBetweenMatches after 7).
9. 2h20m, evaluate C again (MinForCondition after 6 and MinBetweenMatches after 8).

to a timeline like:

1. 0s, hear about risks that depend on PromQL A, B, and C. Evaluate A for the first time.
2. 1s, evaluate B for the first time (MinBetweenMatches after 1).
3. 2s, evaluate C for the first time (MinBetweenMatches after 2).
4. 1h, evaluate A again (MinForCondition after 1, also well past MinBetweenMatches after 3).
5. 1h1s, evaluate B again (MinForCondition after 2 and MinBetweenMatches after 4).
6. 1h2s, evaluate C again (MinForCondition after 3 and MinBetweenMatches after 5).
7. 2h, evaluate A again (MinForCondition after 4, also well past MinBetweenMatches after 6).
8. 2h1s, evaluate B again (MinForCondition after 5 and MinBetweenMatches after 7).
9. 2h2s, evaluate C again (MinForCondition after 6 and MinBetweenMatches after 8).

We could deliver faster cache warming while preserving spaced-out refresh evaluation by splitting MinBetweenMatches into a 1s MinBetweenMatchesInitial and 10m MinBetweenMatchesWhenCached, which would produce timelines like:

1. 0s, hear about risks that depend on PromQL A, B, and C. Evaluate A for the first time.
2. 1s, evaluate B for the first time (MinBetweenMatchesInitial after 1).
3. 2s, evaluate C for the first time (MinBetweenMatchesInitial after 2).
4. 1h, evaluate A again (MinForCondition after 1, also well past MinBetweenMatchesWhenCached after 3).
5. 1h10m, evaluate B again (MinForCondition after 2 and MinBetweenMatchesWhenCached after 4).
6. 1h20m, evaluate C again (MinForCondition after 3 and MinBetweenMatchesWhenCached after 5).
7. 2h, evaluate A again (MinForCondition after 4, also well past MinBetweenMatchesWhenCached after 6).
8. 2h10m, evaluate B again (MinForCondition after 5 and MinBetweenMatchesWhenCached after 7).
9. 2h20m, evaluate C again (MinForCondition after 6 and MinBetweenMatchesWhenCached after 8).

but again discussing with Lala, Scott, and Ben, the code complexity to deliver that distinction does not seem to be worth the protection it delivers to the PromQL engine. And really, PromQL engines concerned about load should harden themselves, including via Retry-After [3] headers that allow clients to back off gracefully when the service needs that, instead of relying on clients to guess about the load the service could handle and back off without insight into actual server capacity.

[1]: https://github.com/openshift/enhancements/blame/158111ce156aac7fa6063a47c00e129c13033aec/enhancements/update/targeted-update-edge-blocking.md#L323-L325
[2]: https://issues.redhat.com/browse/OCPBUGS-19512
[3]: https://www.rfc-editor.org/rfc/rfc9110#name-retry-after
---
 pkg/clusterconditions/promql/promql.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/clusterconditions/promql/promql.go b/pkg/clusterconditions/promql/promql.go
index 7a75046e81..19f20b6897 100644
--- a/pkg/clusterconditions/promql/promql.go
+++ b/pkg/clusterconditions/promql/promql.go
@@ -50,7 +50,7 @@ func NewPromQL(kubeClient kubernetes.Interface) *cache.Cache {
 		},
 		QueryTimeout: 5 * time.Minute,
 	},
-	MinBetweenMatches: 10 * time.Minute,
+	MinBetweenMatches: 1 * time.Second,
 	MinForCondition: time.Hour,
 	Expiration: 24 * time.Hour,
 }
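To make the before/after timelines concrete, here is a tiny standalone sketch of the two throttling knobs as plain durations; the field and method names are illustrative and do not match the cache wrapper's real API.

package main

import (
	"fmt"
	"time"
)

// throttle models the two knobs discussed above: how often a single
// expression may be re-evaluated, and the minimum gap between any two
// PromQL evaluations.
type throttle struct {
	minForCondition   time.Duration
	minBetweenMatches time.Duration
}

// canEvaluate treats the zero time as "never evaluated".
func (t throttle) canEvaluate(now, lastThis, lastAny time.Time) bool {
	return now.Sub(lastThis) >= t.minForCondition && now.Sub(lastAny) >= t.minBetweenMatches
}

func main() {
	before := throttle{minForCondition: time.Hour, minBetweenMatches: 10 * time.Minute}
	after := throttle{minForCondition: time.Hour, minBetweenMatches: time.Second}

	base := time.Now()
	// Risk B has never been evaluated; risk A was evaluated one second ago.
	fmt.Println("10m gap allows evaluating B now:", before.canEvaluate(base.Add(time.Second), time.Time{}, base)) // false
	fmt.Println("1s gap allows evaluating B now:", after.canEvaluate(base.Add(time.Second), time.Time{}, base))   // true
}

Under the old 10m gap the second risk waits ten minutes for its first evaluation; under the new 1s gap it is evaluated almost immediately, while MinForCondition still spaces re-evaluations of the same expression an hour apart.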