OCPBUGS-46379: Kas bootstrap bin #5871
Changes from all commits — a single new file (142 lines added):
```go
package kasbootstrap

import (
	"context"
	"fmt"
	"os"
	"path/filepath"

	configv1 "github.com/openshift/api/config/v1"

	equality "k8s.io/apimachinery/pkg/api/equality"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/serializer"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/sets"

	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/log/zap"

	"go.uber.org/zap/zapcore"
)

func init() {
	utilruntime.Must(configv1.Install(configScheme))
}

var (
	configScheme = runtime.NewScheme()
	configCodecs = serializer.NewCodecFactory(configScheme)
)

func run(ctx context.Context, opts Options) error {
	logger := zap.New(zap.JSONEncoder(func(o *zapcore.EncoderConfig) {
		o.EncodeTime = zapcore.RFC3339TimeEncoder
	}))
	ctrl.SetLogger(logger)

	cfg, err := ctrl.GetConfig()
	if err != nil {
		return fmt.Errorf("failed to get config: %w", err)
	}
	c, err := client.New(cfg, client.Options{Scheme: configScheme})
	if err != nil {
		return fmt.Errorf("failed to create client: %w", err)
	}

	content, err := os.ReadFile(filepath.Join(opts.RenderedFeatureGatePath, "99_feature-gate.yaml"))
	if err != nil {
		return fmt.Errorf("failed to read featureGate file: %w", err)
	}

	renderedFeatureGate, err := parseFeatureGateV1(content)
	if err != nil {
		return fmt.Errorf("failed to parse featureGate file: %w", err)
	}

	if err := reconcileFeatureGate(ctx, c, renderedFeatureGate); err != nil {
		return fmt.Errorf("failed to reconcile featureGate: %w", err)
	}

	// We want to keep the process running for the lifecycle of the Pod because the Pod runs with restartPolicy=Always
	// and it's not possible for individual containers to have a dedicated restartPolicy like OnFailure.

	// Start a goroutine that closes the done channel when the context is done.
	done := make(chan struct{})
	go func() {
		<-ctx.Done()
		close(done)
	}()

	logger.Info("kas-bootstrap process completed successfully, waiting for termination signal")
	<-done

	return nil
}

// reconcileFeatureGate reconciles the featureGate CR status, appending the renderedFeatureGate status.featureGates to the existing featureGates.
// It does not fail if the clusterVersion is not found, as this is expected for a brand new cluster,
// but if the clusterVersion exists it removes any featureGates whose version is not in clusterVersion.Status.History.
func reconcileFeatureGate(ctx context.Context, c client.Client, renderedFeatureGate *configv1.FeatureGate) error {
	logger := ctrl.LoggerFrom(ctx).WithName("kas-bootstrap")

	knownVersions := sets.NewString()
	var clusterVersion configv1.ClusterVersion
	err := c.Get(ctx, client.ObjectKey{Name: "version"}, &clusterVersion)
	if err != nil {
		// We don't fail if we can't get the clusterVersion; we just won't prune any existing featureGate versions.
		// This is always the case for a brand new cluster, as the clusterVersion is not created yet.
		logger.Info("WARNING: failed to get clusterVersion. This is expected for a brand new cluster", "error", err)
	} else {
		knownVersions = sets.NewString(clusterVersion.Status.Desired.Version)
		for _, cvoVersion := range clusterVersion.Status.History {
			knownVersions.Insert(cvoVersion.Version)
```
**Member:** standalone OCP currently doesn't do any garbage collection, so what you have now is fine as it stands. But once you hit your first Completed entry and insert it into `knownVersions`, you can break; there shouldn't be anything left on the cluster that cares about those ancient releases anymore:

```go
knownVersions = sets.NewString(clusterVersion.Status.Desired.Version)
for _, cvoVersion := range clusterVersion.Status.History {
	knownVersions.Insert(cvoVersion.Version)
	if cvoVersion.State == configv1.CompletedUpdate {
		break
	}
}
```

**Author:** Thanks, updated the logic and unit coverage to reflect this.
The file continues:

```go
			// Once we hit the first Completed entry and insert that into knownVersions
			// we can break, because there shouldn't be anything left on the cluster that cares about those ancient releases anymore.
			if cvoVersion.State == configv1.CompletedUpdate {
				break
			}
		}
	}

	var featureGate configv1.FeatureGate
	if err := c.Get(ctx, client.ObjectKey{Name: "cluster"}, &featureGate); err != nil {
		return fmt.Errorf("failed to get featureGate: %w", err)
	}

	desiredFeatureGates := renderedFeatureGate.Status.FeatureGates
	currentVersion := renderedFeatureGate.Status.FeatureGates[0].Version
	for i := range featureGate.Status.FeatureGates {
		featureGateValues := featureGate.Status.FeatureGates[i]
		if featureGateValues.Version == currentVersion {
			continue
		}
		if len(knownVersions) > 0 && !knownVersions.Has(featureGateValues.Version) {
			continue
		}
		desiredFeatureGates = append(desiredFeatureGates, featureGateValues)
	}

	if equality.Semantic.DeepEqual(desiredFeatureGates, featureGate.Status.FeatureGates) {
		logger.Info("There is no update for featureGate.Status.FeatureGates")
		return nil
	}

	original := featureGate.DeepCopy()
	featureGate.Status.FeatureGates = desiredFeatureGates
	if err := c.Status().Patch(ctx, &featureGate, client.MergeFromWithOptions(original, client.MergeFromWithOptimisticLock{})); err != nil {
		return fmt.Errorf("failed to update featureGate: %w", err)
	}
	return nil
}

func parseFeatureGateV1(objBytes []byte) (*configv1.FeatureGate, error) {
	requiredObj, err := runtime.Decode(configCodecs.UniversalDecoder(configv1.SchemeGroupVersion), objBytes)
	if err != nil {
		return nil, fmt.Errorf("failed to decode featureGate: %w", err)
	}

	return requiredObj.(*configv1.FeatureGate), nil
}
```
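Not part of the diff, but to make the pruning behaviour concrete (and in the spirit of the unit coverage mentioned in the thread above), a sketch along these lines could exercise `reconcileFeatureGate` with the controller-runtime fake client. The test name and version strings are purely illustrative and do not claim to match the PR's actual tests:

```go
package kasbootstrap

import (
	"context"
	"testing"

	configv1 "github.com/openshift/api/config/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

// Illustrative only: a featureGate entry for a version ClusterVersion no longer
// knows about should be dropped, while the rendered entry is kept.
func TestReconcileFeatureGatePrunesUnknownVersions(t *testing.T) {
	cv := &configv1.ClusterVersion{
		ObjectMeta: metav1.ObjectMeta{Name: "version"},
		Status: configv1.ClusterVersionStatus{
			Desired: configv1.Release{Version: "4.19.0"},
		},
	}
	existing := &configv1.FeatureGate{
		ObjectMeta: metav1.ObjectMeta{Name: "cluster"},
		Status: configv1.FeatureGateStatus{
			// Stale entry from a version that is not in knownVersions.
			FeatureGates: []configv1.FeatureGateDetails{{Version: "4.17.0"}},
		},
	}
	rendered := &configv1.FeatureGate{
		Status: configv1.FeatureGateStatus{
			FeatureGates: []configv1.FeatureGateDetails{{Version: "4.19.0"}},
		},
	}

	c := fake.NewClientBuilder().
		WithScheme(configScheme).
		WithObjects(cv, existing).
		WithStatusSubresource(existing).
		Build()

	if err := reconcileFeatureGate(context.Background(), c, rendered); err != nil {
		t.Fatalf("reconcileFeatureGate failed: %v", err)
	}

	var got configv1.FeatureGate
	if err := c.Get(context.Background(), client.ObjectKey{Name: "cluster"}, &got); err != nil {
		t.Fatalf("failed to get featureGate: %v", err)
	}
	// Only the rendered 4.19.0 entry should remain; 4.17.0 is pruned.
	if len(got.Status.FeatureGates) != 1 || got.Status.FeatureGates[0].Version != "4.19.0" {
		t.Fatalf("unexpected featureGates: %+v", got.Status.FeatureGates)
	}
}
```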
Can you explain why we want to keep the process running?

Because the Pod RestartPolicy is Always; this mimics current behaviour, and I would defer deviating from it to a different change.
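(As an aside, a minimal sketch of that lifecycle, assuming a hypothetical `Execute` wrapper that is not part of this diff and an illustrative `RenderedFeatureGatePath`: the context handed to `run` comes from a signal handler, so the `<-done` wait inside `run` unblocks when the Pod is terminated.)

```go
// Hypothetical wiring, not part of this PR: drive run with a signal-aware
// context so the process exits cleanly on Pod termination instead of never.
func Execute() {
	ctx := ctrl.SetupSignalHandler() // cancelled on SIGTERM/SIGINT

	opts := Options{RenderedFeatureGatePath: "/work"} // path is illustrative
	if err := run(ctx, opts); err != nil {
		fmt.Fprintf(os.Stderr, "kas-bootstrap failed: %v\n", err)
		os.Exit(1)
	}
}
```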
Ack. Any reason we are not adding this as an init container instead?
I'd guess it can't be an init container because it needs the actual API-server in another long-lived container in this same Pod to talk to. But couldn't we set a container-scoped `restartPolicy: OnFailure` for this container to get both: a `KubePodCrashLooping` alert if the container has trouble, while the container continues to relaunch and retry? Not as direct as having the controlling CPO know why the container was having trouble, but at least there would be a sign of trouble visible in Kube at a higher level than "dip into the container's logs".
@wking this last comment led me to do a little bit of experimentation on a 4.19 CI cluster :)

Tried changing the restart policy of a side container under `.spec.containers`, and failed admission with:

```
* spec.template.spec.containers[1].restartPolicy: Forbidden: may not be set for non-init containers
```

Then tried moving the container under `.spec.initContainers` with a restartPolicy of `OnFailure`, and also failed admission with:

```
* spec.template.spec.initContainers[0].restartPolicy: Unsupported value: "OnFailure": supported values: "Always"
```

Then tried changing the init container restartPolicy to `Always`, and the deployment was accepted. The init container ended up running as a side container (which was new to me :)). I could not see a difference, though, between the additional container under `.spec.containers` and the init container with restartPolicy `Always`.

Bottom line, I think what we have here is fine.
Yes, the reason I didn't just set restart-on-failure for this container is that, AFAIK, individual containers can't supersede the Pod restartPolicy. Moving it to init as a side container would technically differ operationally from what we do at the moment and possibly cause more restarts while racing rendering, for no value. So I'd rather keep it as it is to keep the changes scoped and defer any further change to different PRs. After this one we'll still need to move the apply logic to this binary.

I added a comment in the code to clarify the restart policies.
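For reference, a rough Go sketch of the init-container-as-sidecar shape probed in the experiment above, using `corev1` types; the function name, container names, and images are placeholders rather than anything this PR ships. A container-level `restartPolicy` is only accepted on init containers, and only with the value `Always` (the Kubernetes 1.28+ native-sidecar behaviour), which matches the admission errors quoted above:

```go
package kasbootstrap

import (
	corev1 "k8s.io/api/core/v1"
)

// buildSidecarPodSpec is illustrative only: an init container with
// restartPolicy: Always runs as a long-lived "native sidecar", while
// OnFailure, or any restartPolicy on a regular container, is rejected
// at admission.
func buildSidecarPodSpec() corev1.PodSpec {
	always := corev1.ContainerRestartPolicyAlways
	return corev1.PodSpec{
		RestartPolicy: corev1.RestartPolicyAlways, // pod-level policy stays Always
		InitContainers: []corev1.Container{{
			Name:          "kas-bootstrap",
			Image:         "example.invalid/kas-bootstrap:latest", // placeholder image
			RestartPolicy: &always,
		}},
		Containers: []corev1.Container{{
			Name:  "kube-apiserver",
			Image: "example.invalid/kube-apiserver:latest", // placeholder image
		}},
	}
}
```

Whether that shape buys anything over the current extra container is exactly the open question in this thread, which is why the PR keeps the plain container and defers any change to follow-up PRs.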