diff --git a/cmd/machine-config-daemon/start.go b/cmd/machine-config-daemon/start.go index c205dea22d..f02c354ee1 100644 --- a/cmd/machine-config-daemon/start.go +++ b/cmd/machine-config-daemon/start.go @@ -182,6 +182,7 @@ func runStartCmd(_ *cobra.Command, _ []string) { ctrlctx.InformerFactory.Machineconfiguration().V1().MachineConfigs(), ctrlctx.KubeInformerFactory.Core().V1().Nodes(), ctrlctx.InformerFactory.Machineconfiguration().V1().ControllerConfigs(), + ctrlctx.ClientBuilder.OperatorClientOrDie(componentName), startOpts.kubeletHealthzEnabled, startOpts.kubeletHealthzEndpoint, ctrlctx.FeatureGateAccess, @@ -194,6 +195,7 @@ func runStartCmd(_ *cobra.Command, _ []string) { ctrlctx.KubeInformerFactory.Start(stopCh) ctrlctx.KubeNamespacedInformerFactory.Start(stopCh) ctrlctx.InformerFactory.Start(stopCh) + ctrlctx.OperatorInformerFactory.Start(stopCh) close(ctrlctx.InformersStarted) select { diff --git a/manifests/machineconfigdaemon/clusterrole.yaml b/manifests/machineconfigdaemon/clusterrole.yaml index b3d3e3c5dc..26804ca2ce 100644 --- a/manifests/machineconfigdaemon/clusterrole.yaml +++ b/manifests/machineconfigdaemon/clusterrole.yaml @@ -19,6 +19,9 @@ rules: resourceNames: ["privileged"] resources: ["securitycontextconstraints"] verbs: ["use"] +- apiGroups: ["operator.openshift.io"] + resources: ["machineconfigurations"] + verbs: ["get", "list", "watch"] - apiGroups: - authentication.k8s.io resources: diff --git a/pkg/apihelpers/apihelpers.go b/pkg/apihelpers/apihelpers.go index 70caaedaf7..1fefdbd2b1 100644 --- a/pkg/apihelpers/apihelpers.go +++ b/pkg/apihelpers/apihelpers.go @@ -8,8 +8,73 @@ import ( "fmt" mcfgv1 "github.com/openshift/api/machineconfiguration/v1" + opv1 "github.com/openshift/api/operator/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/klog/v2" +) + +var ( + // This is the list of MCO's default node disruption policies. + defaultClusterPolicies = opv1.NodeDisruptionPolicyClusterStatus{ + Files: []opv1.NodeDisruptionPolicyStatusFile{ + { + Path: "/etc/mco/internal-registry-pull-secret.json", + Actions: []opv1.NodeDisruptionPolicyStatusAction{ + { + Type: opv1.NoneStatusAction, + }, + }, + }, + { + Path: "/var/lib/kubelet/config.json", + Actions: []opv1.NodeDisruptionPolicyStatusAction{ + { + Type: opv1.NoneStatusAction, + }, + }, + }, + { + Path: "/etc/machine-config-daemon/no-reboot/containers-gpg.pub", + Actions: []opv1.NodeDisruptionPolicyStatusAction{ + { + Type: opv1.ReloadStatusAction, + Reload: &opv1.ReloadService{ + ServiceName: "crio.service", + }, + }, + }, + }, + { + Path: "/etc/containers/policy.json", + Actions: []opv1.NodeDisruptionPolicyStatusAction{ + { + Type: opv1.ReloadStatusAction, + Reload: &opv1.ReloadService{ + ServiceName: "crio.service", + }, + }, + }, + }, + { + Path: "/etc/containers/registries.conf", + Actions: []opv1.NodeDisruptionPolicyStatusAction{ + { + Type: opv1.SpecialStatusAction, + }, + }, + }, + }, + SSHKey: opv1.NodeDisruptionPolicyStatusSSHKey{ + Actions: []opv1.NodeDisruptionPolicyStatusAction{ + { + Type: opv1.NoneStatusAction, + }, + }, + }, + } ) // NewMachineConfigPoolCondition creates a new MachineConfigPool condition. @@ -214,3 +279,120 @@ func IsControllerConfigCompleted(ccName string, ccGetter func(string) (*mcfgv1.C } return fmt.Errorf("ControllerConfig has not completed: completed(%v) running(%v) failing(%v)", completed, running, failing) } + +// Merges the cluster's default node disruption policies with the user defined policies, if any. +func MergeClusterPolicies(userDefinedClusterPolicies opv1.NodeDisruptionPolicyConfig) opv1.NodeDisruptionPolicyClusterStatus { + + mergedClusterPolicies := opv1.NodeDisruptionPolicyClusterStatus{} + + // Add default file policies to the merged list. + mergedClusterPolicies.Files = append(mergedClusterPolicies.Files, defaultClusterPolicies.Files...) + + // Iterate through user file policies. + // If there is a conflict with default policy, replace that entry in the merged list with the user defined policy. + // If there was no conflict, add the user defined policy as a new entry to the merged list. + for _, userDefinedPolicyFile := range userDefinedClusterPolicies.Files { + override := false + for i, defaultPolicyFile := range defaultClusterPolicies.Files { + if defaultPolicyFile.Path == userDefinedPolicyFile.Path { + mergedClusterPolicies.Files[i] = convertSpecFileToStatusFile(userDefinedPolicyFile) + override = true + break + } + } + if !override { + mergedClusterPolicies.Files = append(mergedClusterPolicies.Files, convertSpecFileToStatusFile(userDefinedPolicyFile)) + } + } + + // Add default service unit policies to the merged list. + mergedClusterPolicies.Units = append(mergedClusterPolicies.Units, defaultClusterPolicies.Units...) + + // Iterate through user service unit policies. + // If there is a conflict with default policy, replace that entry in the merged list with the user defined policy. + // If there was no conflict, add the user defined policy as a new entry to the merged list. + for _, userDefinedPolicyUnit := range userDefinedClusterPolicies.Units { + override := false + for i, defaultPolicyUnit := range defaultClusterPolicies.Units { + if defaultPolicyUnit.Name == userDefinedPolicyUnit.Name { + mergedClusterPolicies.Units[i] = convertSpecUnitToStatusUnit(userDefinedPolicyUnit) + override = true + break + } + } + if !override { + mergedClusterPolicies.Units = append(mergedClusterPolicies.Units, convertSpecUnitToStatusUnit(userDefinedPolicyUnit)) + } + } + + // If no user defined SSH policy exists, use the cluster defaults. + if len(userDefinedClusterPolicies.SSHKey.Actions) == 0 { + mergedClusterPolicies.SSHKey = *defaultClusterPolicies.SSHKey.DeepCopy() + } else { + mergedClusterPolicies.SSHKey = convertSpecSSHKeyToStatusSSHKey(*userDefinedClusterPolicies.SSHKey.DeepCopy()) + } + return mergedClusterPolicies +} + +// converts NodeDisruptionPolicySpecFile -> NodeDisruptionPolicyStatusFile +func convertSpecFileToStatusFile(specFile opv1.NodeDisruptionPolicySpecFile) opv1.NodeDisruptionPolicyStatusFile { + statusFile := opv1.NodeDisruptionPolicyStatusFile{Path: specFile.Path, Actions: []opv1.NodeDisruptionPolicyStatusAction{}} + for _, action := range specFile.Actions { + statusFile.Actions = append(statusFile.Actions, convertSpecActiontoStatusAction(action)) + } + return statusFile +} + +// converts NodeDisruptionPolicySpecUnit -> NodeDisruptionPolicyStatusUnit +func convertSpecUnitToStatusUnit(specUnit opv1.NodeDisruptionPolicySpecUnit) opv1.NodeDisruptionPolicyStatusUnit { + statusUnit := opv1.NodeDisruptionPolicyStatusUnit{Name: specUnit.Name, Actions: []opv1.NodeDisruptionPolicyStatusAction{}} + for _, action := range specUnit.Actions { + statusUnit.Actions = append(statusUnit.Actions, convertSpecActiontoStatusAction(action)) + } + return statusUnit +} + +// converts NodeDisruptionPolicySpecSSHKey -> NodeDisruptionPolicyStatusSSHKey +func convertSpecSSHKeyToStatusSSHKey(specSSHKey opv1.NodeDisruptionPolicySpecSSHKey) opv1.NodeDisruptionPolicyStatusSSHKey { + statusSSHKey := opv1.NodeDisruptionPolicyStatusSSHKey{Actions: []opv1.NodeDisruptionPolicyStatusAction{}} + for _, action := range specSSHKey.Actions { + statusSSHKey.Actions = append(statusSSHKey.Actions, convertSpecActiontoStatusAction(action)) + } + return statusSSHKey +} + +// converts NodeDisruptionPolicySpecAction -> NodeDisruptionPolicyStatusAction +func convertSpecActiontoStatusAction(action opv1.NodeDisruptionPolicySpecAction) opv1.NodeDisruptionPolicyStatusAction { + switch action.Type { + case opv1.DaemonReloadSpecAction: + return opv1.NodeDisruptionPolicyStatusAction{Type: opv1.DaemonReloadStatusAction} + case opv1.DrainSpecAction: + return opv1.NodeDisruptionPolicyStatusAction{Type: opv1.DrainStatusAction} + case opv1.NoneSpecAction: + return opv1.NodeDisruptionPolicyStatusAction{Type: opv1.NoneStatusAction} + case opv1.RebootSpecAction: + return opv1.NodeDisruptionPolicyStatusAction{Type: opv1.RebootStatusAction} + case opv1.ReloadSpecAction: + return opv1.NodeDisruptionPolicyStatusAction{Type: opv1.ReloadStatusAction, Reload: &opv1.ReloadService{ + ServiceName: action.Reload.ServiceName, + }} + case opv1.RestartSpecAction: + return opv1.NodeDisruptionPolicyStatusAction{Type: opv1.RestartStatusAction, Restart: &opv1.RestartService{ + ServiceName: action.Restart.ServiceName, + }} + default: // We should never be here as this is guarded by API validation. The return statement is to silence errors. + klog.Fatal("Unexpected action type found in Node Disruption Status calculation") + return opv1.NodeDisruptionPolicyStatusAction{Type: opv1.RebootStatusAction} + } +} + +// Checks if a list of NodeDisruptionActions contain any action from the set of target actions +func CheckNodeDisruptionActionsForTargetActions(actions []opv1.NodeDisruptionPolicyStatusAction, targetActions ...opv1.NodeDisruptionPolicyStatusActionType) bool { + + currentActions := sets.New[opv1.NodeDisruptionPolicyStatusActionType]() + for _, action := range actions { + currentActions.Insert(action.Type) + } + + return currentActions.HasAny(targetActions...) +} diff --git a/pkg/controller/common/helpers.go b/pkg/controller/common/helpers.go index a094c347ea..eb79b7722f 100644 --- a/pkg/controller/common/helpers.go +++ b/pkg/controller/common/helpers.go @@ -970,6 +970,8 @@ func dedupePasswdUserSSHKeys(passwdUser ign2types.PasswdUser) ign2types.PasswdUs // CalculateConfigFileDiffs compares the files present in two ignition configurations and returns the list of files // that are different between them +// +//nolint:dupl func CalculateConfigFileDiffs(oldIgnConfig, newIgnConfig *ign3types.Config) []string { // Go through the files and see what is new or different oldFileSet := make(map[string]ign3types.File) @@ -986,8 +988,6 @@ func CalculateConfigFileDiffs(oldIgnConfig, newIgnConfig *ign3types.Config) []st for path := range oldFileSet { _, ok := newFileSet[path] if !ok { - // debug: remove - klog.Infof("File diff: %v was deleted", path) diffFileSet = append(diffFileSet, path) } } @@ -996,18 +996,50 @@ func CalculateConfigFileDiffs(oldIgnConfig, newIgnConfig *ign3types.Config) []st for path, newFile := range newFileSet { oldFile, ok := oldFileSet[path] if !ok { - // debug: remove - klog.Infof("File diff: %v was added", path) diffFileSet = append(diffFileSet, path) } else if !reflect.DeepEqual(oldFile, newFile) { - // debug: remove - klog.Infof("File diff: detected change to %v", newFile.Path) diffFileSet = append(diffFileSet, path) } } return diffFileSet } +// CalculateConfigUnitDiffs compares the units present in two ignition configurations and returns the list of units +// that are different between them +// +//nolint:dupl +func CalculateConfigUnitDiffs(oldIgnConfig, newIgnConfig *ign3types.Config) []string { + // Go through the units and see what is new or different + oldUnitSet := make(map[string]ign3types.Unit) + for _, u := range oldIgnConfig.Systemd.Units { + oldUnitSet[u.Name] = u + } + newUnitSet := make(map[string]ign3types.Unit) + for _, u := range newIgnConfig.Systemd.Units { + newUnitSet[u.Name] = u + } + diffUnitSet := []string{} + + // First check if any units were removed + for unit := range oldUnitSet { + _, ok := newUnitSet[unit] + if !ok { + diffUnitSet = append(diffUnitSet, unit) + } + } + + // Now check if any units were added/changed + for name, newUnit := range newUnitSet { + oldUnit, ok := oldUnitSet[name] + if !ok { + diffUnitSet = append(diffUnitSet, name) + } else if !reflect.DeepEqual(oldUnit, newUnit) { + diffUnitSet = append(diffUnitSet, name) + } + } + return diffUnitSet +} + // NewIgnFile returns a simple ignition3 file from just path and file contents. // It also ensures the compression field is set to the empty string, which is // currently required for ensuring child configs that may be merged layer diff --git a/pkg/daemon/constants/constants.go b/pkg/daemon/constants/constants.go index 6619cbec53..9dca9faab2 100644 --- a/pkg/daemon/constants/constants.go +++ b/pkg/daemon/constants/constants.go @@ -108,4 +108,7 @@ const ( // CRIOServiceName is used to specify reloads and restarts of the CRI-O service CRIOServiceName = "crio" + + // DaemonReloadCommand is used to specify reloads and restarts of the systemd manager configuration + DaemonReloadCommand = "daemon-reload" ) diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 598cfcda78..56334788f5 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -20,6 +20,7 @@ import ( "time" mcfgclientset "github.com/openshift/client-go/machineconfiguration/clientset/versioned" + mcopclientset "github.com/openshift/client-go/operator/clientset/versioned" "github.com/openshift/library-go/pkg/operator/configobserver/featuregates" ign3types "github.com/coreos/ignition/v2/config/v3_4/types" @@ -44,6 +45,7 @@ import ( mcfgalphav1 "github.com/openshift/api/machineconfiguration/v1alpha1" mcfginformersv1 "github.com/openshift/client-go/machineconfiguration/informers/externalversions/machineconfiguration/v1" mcfglistersv1 "github.com/openshift/client-go/machineconfiguration/listers/machineconfiguration/v1" + mcoResourceRead "github.com/openshift/machine-config-operator/lib/resourceread" ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" "github.com/openshift/machine-config-operator/pkg/daemon/constants" @@ -86,6 +88,9 @@ type Daemon struct { mcfgClient mcfgclientset.Interface + // mcopClient allows interaction with Openshift operator level objects, such as MachineConfiguration + mcopClient mcopclientset.Interface + // nodeLister is used to watch for updates via the informer nodeLister corev1lister.NodeLister nodeListerSynced cache.InformerSynced @@ -348,6 +353,7 @@ func (dn *Daemon) ClusterConnect( mcInformer mcfginformersv1.MachineConfigInformer, nodeInformer coreinformersv1.NodeInformer, ccInformer mcfginformersv1.ControllerConfigInformer, + mcopClient mcopclientset.Interface, kubeletHealthzEnabled bool, kubeletHealthzEndpoint string, featureGatesAccessor featuregates.FeatureGateAccess, @@ -355,7 +361,7 @@ func (dn *Daemon) ClusterConnect( dn.name = name dn.kubeClient = kubeClient dn.mcfgClient = mcfgClient - + dn.mcopClient = mcopClient // Other controllers start out with the default controller limiter which retries // in milliseconds; since any change here will involve rebooting the node // we don't need to react in milliseconds. See also updateDelay above. diff --git a/pkg/daemon/daemon_test.go b/pkg/daemon/daemon_test.go index 50101a8c45..63dc915827 100644 --- a/pkg/daemon/daemon_test.go +++ b/pkg/daemon/daemon_test.go @@ -22,6 +22,8 @@ import ( "k8s.io/apimachinery/pkg/util/diff" kubeinformers "k8s.io/client-go/informers" k8sfake "k8s.io/client-go/kubernetes/fake" + + mcopfake "github.com/openshift/client-go/operator/clientset/versioned/fake" core "k8s.io/client-go/testing" "k8s.io/client-go/tools/cache" @@ -115,6 +117,7 @@ type fixture struct { client *fake.Clientset kubeclient *k8sfake.Clientset + oclient *mcopfake.Clientset mcLister []*mcfgv1.MachineConfig nodeLister []*corev1.Node @@ -124,6 +127,7 @@ type fixture struct { objects []runtime.Object kubeobjects []runtime.Object + oObjects []runtime.Object } func newFixture(t *testing.T) *fixture { @@ -142,6 +146,7 @@ var ( func (f *fixture) newController() *Daemon { f.client = fake.NewSimpleClientset(f.objects...) f.kubeclient = k8sfake.NewSimpleClientset(f.kubeobjects...) + f.oclient = mcopfake.NewSimpleClientset(f.oObjects...) i := informers.NewSharedInformerFactory(f.client, noResyncPeriodFunc()) k8sI := kubeinformers.NewSharedInformerFactory(f.kubeclient, noResyncPeriodFunc()) @@ -156,6 +161,7 @@ func (f *fixture) newController() *Daemon { i.Machineconfiguration().V1().MachineConfigs(), k8sI.Core().V1().Nodes(), i.Machineconfiguration().V1().ControllerConfigs(), + f.oclient, false, "", d.featureGatesAccessor, diff --git a/pkg/daemon/drain.go b/pkg/daemon/drain.go index 3b75ee9c01..4d22f5e70d 100644 --- a/pkg/daemon/drain.go +++ b/pkg/daemon/drain.go @@ -10,6 +10,8 @@ import ( "github.com/containers/image/v5/pkg/sysregistriesv2" ign3types "github.com/coreos/ignition/v2/config/v3_4/types" mcfgalphav1 "github.com/openshift/api/machineconfiguration/v1alpha1" + opv1 "github.com/openshift/api/operator/v1" + "github.com/openshift/machine-config-operator/pkg/apihelpers" ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" "github.com/openshift/machine-config-operator/pkg/daemon/constants" "github.com/openshift/machine-config-operator/pkg/upgrademonitor" @@ -114,6 +116,24 @@ func (dn *Daemon) performDrain() error { return nil } +// isDrainRequiredForNodeDisruptionActions determines whether node drain is required or not to apply config changes for this set of NodeDisruptionActions +func isDrainRequiredForNodeDisruptionActions(actions []opv1.NodeDisruptionPolicyStatusAction, oldIgnConfig, newIgnConfig ign3types.Config) (bool, error) { + klog.Infof("Checking drain required for node disruption actions") + if apihelpers.CheckNodeDisruptionActionsForTargetActions(actions, opv1.RebootStatusAction, opv1.DrainStatusAction) { + // We definitely want to perform drain for these cases + return true, nil + } else if apihelpers.CheckNodeDisruptionActionsForTargetActions(actions, opv1.SpecialStatusAction) { + // This is a specially reserved action for "/etc/containers/registries.conf" and for this action, drain may or may not be necessary + isSafe, err := isSafeContainerRegistryConfChanges(oldIgnConfig, newIgnConfig) + if err != nil { + return false, err + } + return !isSafe, nil + } + // If only other actions are being done, no drain is necessary + return false, nil +} + // isDrainRequired determines whether node drain is required or not to apply config changes. func isDrainRequired(actions, diffFileSet []string, oldIgnConfig, newIgnConfig ign3types.Config, overrideImageRegistryDrain bool) (bool, error) { if ctrlcommon.InSlice(postConfigChangeActionReboot, actions) { diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go index f83d7d9f17..0a80228d65 100644 --- a/pkg/daemon/update.go +++ b/pkg/daemon/update.go @@ -25,11 +25,17 @@ import ( "k8s.io/apimachinery/pkg/runtime" kubeErrs "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/uuid" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/klog/v2" + v1 "github.com/openshift/api/config/v1" mcfgv1 "github.com/openshift/api/machineconfiguration/v1" mcfgalphav1 "github.com/openshift/api/machineconfiguration/v1alpha1" + "github.com/openshift/library-go/pkg/operator/configobserver/featuregates" + opv1 "github.com/openshift/api/operator/v1" + + "github.com/openshift/machine-config-operator/pkg/apihelpers" ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" "github.com/openshift/machine-config-operator/pkg/daemon/constants" pivottypes "github.com/openshift/machine-config-operator/pkg/daemon/pivot/types" @@ -83,6 +89,146 @@ func reloadService(name string) error { return runCmdSync("systemctl", "reload", name) } +func reloadDaemon() error { + return runCmdSync("systemctl", constants.DaemonReloadCommand) +} + +func (dn *Daemon) finishRebootlessUpdate() error { + // Get current state of node, in case of an error reboot + state, err := dn.getStateAndConfigs() + if err != nil { + return fmt.Errorf("could not apply update: error processing state and configs. Error: %w", err) + } + + var inDesiredConfig bool + var missingODC bool + if missingODC, inDesiredConfig, err = dn.updateConfigAndState(state); err != nil { + return fmt.Errorf("could not apply update: setting node's state to Done failed. Error: %w", err) + } + + if missingODC { + return fmt.Errorf("error updating state.currentconfig from on-disk currentconfig") + } + + if inDesiredConfig { + // (re)start the config drift monitor since rebooting isn't needed. + dn.startConfigDriftMonitor() + return nil + } + + // currentConfig != desiredConfig, kick off an update + return dn.triggerUpdateWithMachineConfig(state.currentConfig, state.desiredConfig, true) +} + +func (dn *Daemon) executeReloadServiceNodeDisruptionAction(serviceName string, reloadErr error) error { + if reloadErr != nil { + if dn.nodeWriter != nil { + dn.nodeWriter.Eventf(corev1.EventTypeWarning, "FailedServiceReload", fmt.Sprintf("Reloading service %s failed. Error: %v", serviceName, reloadErr)) + } + return fmt.Errorf("could not apply update: reloading %s configuration failed. Error: %w", serviceName, reloadErr) + } + + err := upgrademonitor.GenerateAndApplyMachineConfigNodes( + &upgrademonitor.Condition{State: mcfgalphav1.MachineConfigNodeUpdatePostActionComplete, Reason: string(mcfgalphav1.MachineConfigNodeUpdateReloaded), Message: fmt.Sprintf("Node has reloaded service %s", serviceName)}, + &upgrademonitor.Condition{State: mcfgalphav1.MachineConfigNodeUpdateReloaded, Reason: fmt.Sprintf("%s%s", string(mcfgalphav1.MachineConfigNodeUpdatePostActionComplete), string(mcfgalphav1.MachineConfigNodeUpdateReloaded)), Message: fmt.Sprintf("Upgrade required a service %s reload. Completed this this as a post update action.", serviceName)}, + metav1.ConditionTrue, + metav1.ConditionTrue, + dn.node, + dn.mcfgClient, + dn.featureGatesAccessor, + ) + if err != nil { + klog.Errorf("Error making MCN for Reloading success: %v", err) + } + + if dn.nodeWriter != nil { + dn.nodeWriter.Eventf(corev1.EventTypeNormal, "ServiceReload", "Config changes do not require reboot. Service %s was reloaded.", serviceName) + } + logSystem("%s service reloaded successfully!", serviceName) + return nil +} + +// performPostConfigChangeNodeDisruptionAction takes action based on the cluster's Node disruption policies. +// For non-reboot action, it applies configuration, updates node's config and state. +// In the end uncordon node to schedule workload. +// If at any point an error occurs, we reboot the node so that node has correct configuration. +func (dn *Daemon) performPostConfigChangeNodeDisruptionAction(postConfigChangeActions []opv1.NodeDisruptionPolicyStatusAction, configName string) error { + + logSystem("Executing performPostConfigChangeNodeDisruptionAction(drain already complete/skipped) for config %s", configName) + for _, action := range postConfigChangeActions { + logSystem("Executing postconfig action: %v for config %s", action.Type, configName) + if action.Type == opv1.RebootStatusAction { + err := upgrademonitor.GenerateAndApplyMachineConfigNodes( + &upgrademonitor.Condition{State: mcfgalphav1.MachineConfigNodeUpdatePostActionComplete, Reason: string(mcfgalphav1.MachineConfigNodeUpdateRebooted), Message: fmt.Sprintf("Node will reboot into config %s", configName)}, + &upgrademonitor.Condition{State: mcfgalphav1.MachineConfigNodeUpdateRebooted, Reason: fmt.Sprintf("%s%s", string(mcfgalphav1.MachineConfigNodeUpdatePostActionComplete), string(mcfgalphav1.MachineConfigNodeUpdateRebooted)), Message: "Upgrade requires a reboot. Currently doing this as the post update action."}, + metav1.ConditionUnknown, + metav1.ConditionUnknown, + dn.node, + dn.mcfgClient, + dn.featureGatesAccessor, + ) + if err != nil { + klog.Errorf("Error making MCN for rebooting: %v", err) + } + logSystem("Rebooting node") + return dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName)) + } else if action.Type == opv1.NoneStatusAction { + if dn.nodeWriter != nil { + dn.nodeWriter.Eventf(corev1.EventTypeNormal, "SkipReboot", "Config changes do not require reboot.") + } + err := upgrademonitor.GenerateAndApplyMachineConfigNodes( + &upgrademonitor.Condition{State: mcfgalphav1.MachineConfigNodeUpdatePostActionComplete, Reason: "None", Message: "Changes do not require a reboot"}, + nil, + metav1.ConditionTrue, + metav1.ConditionFalse, + dn.node, + dn.mcfgClient, + dn.featureGatesAccessor, + ) + if err != nil { + klog.Errorf("Error making MCN for no post config change action: %v", err) + } + logSystem("Node has Desired Config %s, skipping reboot", configName) + } else if action.Type == opv1.RestartStatusAction { + + serviceName := string(action.Restart.ServiceName) + + if err := restartService(serviceName); err != nil { + if dn.nodeWriter != nil { + dn.nodeWriter.Eventf(corev1.EventTypeWarning, "FailedServiceRestart", fmt.Sprintf("Restarting %s service failed. Error: %v", serviceName, err)) + } + return fmt.Errorf("could not apply update: restarting %s service failed. Error: %w", serviceName, err) + } + // TODO: Add a new MCN Condition to the API for service restarts? + if dn.nodeWriter != nil { + dn.nodeWriter.Eventf(corev1.EventTypeNormal, "ServiceRestart", "Config changes do not require reboot. Service %s was restarted.", serviceName) + } + logSystem("%s service restarted successfully!", serviceName) + + } else if action.Type == opv1.ReloadStatusAction { + // Execute a generic service reload defined by the action object + serviceName := string(action.Reload.ServiceName) + if err := dn.executeReloadServiceNodeDisruptionAction(serviceName, reloadService(serviceName)); err != nil { + return err + } + + } else if action.Type == opv1.SpecialStatusAction { + // The special action type requires a CRIO reload + if err := dn.executeReloadServiceNodeDisruptionAction(constants.CRIOServiceName, reloadService(constants.CRIOServiceName)); err != nil { + return err + } + } else if action.Type == opv1.DaemonReloadStatusAction { + // Execute daemon-reload + if err := dn.executeReloadServiceNodeDisruptionAction(constants.DaemonReloadCommand, reloadDaemon()); err != nil { + return err + } + } + } + + // We are here, which means a reboot was not needed to apply the configuration. + return dn.finishRebootlessUpdate() +} + // performPostConfigChangeAction takes action based on what postConfigChangeAction has been asked. // For non-reboot action, it applies configuration, updates node's config and state. // In the end uncordon node to schedule workload. @@ -173,32 +319,9 @@ func (dn *Daemon) performPostConfigChangeAction(postConfigChangeActions []string logSystem("%s config restarted successfully! Desired config %s has been applied, skipping reboot", serviceName, configName) } - // We are here, which means reboot was not needed to apply the configuration. - // Get current state of node, in case of an error reboot - state, err := dn.getStateAndConfigs() - if err != nil { - return fmt.Errorf("could not apply update: error processing state and configs. Error: %w", err) - } - - var inDesiredConfig bool - var missingODC bool - if missingODC, inDesiredConfig, err = dn.updateConfigAndState(state); err != nil { - return fmt.Errorf("could not apply update: setting node's state to Done failed. Error: %w", err) - } - - if missingODC { - return fmt.Errorf("error updating state.currentconfig from on-disk currentconfig") - } - - if inDesiredConfig { - // (re)start the config drift monitor since rebooting isn't needed. - dn.startConfigDriftMonitor() - return nil - } - - // currentConfig != desiredConfig, kick off an update - return dn.triggerUpdateWithMachineConfig(state.currentConfig, state.desiredConfig, true) + // We are here, which means a reboot was not needed to apply the configuration. + return dn.finishRebootlessUpdate() } func setRunningKargsWithCmdline(config *mcfgv1.MachineConfig, requestedKargs []string, cmdline []byte) error { @@ -438,7 +561,7 @@ func (dn *CoreOSDaemon) applyOSChanges(mcDiff machineConfigDiff, oldConfig, newC return nil } -func calculatePostConfigChangeActionFromFileDiffs(diffFileSet []string) (actions []string) { +func calculatePostConfigChangeActionFromMCDiffs(diffFileSet []string) (actions []string) { filesPostConfigChangeActionNone := []string{ caBundleFilePath, imageRegistryAuthFile, @@ -457,6 +580,7 @@ func calculatePostConfigChangeActionFromFileDiffs(diffFileSet []string) (actions } actions = []string{postConfigChangeActionNone} + for _, path := range diffFileSet { if ctrlcommon.InSlice(path, filesPostConfigChangeActionNone) { continue @@ -474,6 +598,96 @@ func calculatePostConfigChangeActionFromFileDiffs(diffFileSet []string) (actions return } +// calculatePostConfigChangeNodeDisruptionActionFromMCDiffs takes action based on the cluster's Node disruption policies. +func calculatePostConfigChangeNodeDisruptionActionFromMCDiffs(diffSSH bool, diffFileSet, diffUnitSet []string, clusterPolicies opv1.NodeDisruptionPolicyClusterStatus) []opv1.NodeDisruptionPolicyStatusAction { + actions := []opv1.NodeDisruptionPolicyStatusAction{} + + // Step through all file based policies, and build out the actions object + for _, diffPath := range diffFileSet { + pathFound := false + for _, policyFile := range clusterPolicies.Files { + klog.V(4).Infof("comparing policy path %s to diff path %s", policyFile.Path, diffPath) + if policyFile.Path == diffPath { + klog.Infof("NodeDisruptionPolicy found for diff file %s!", diffPath) + actions = append(actions, policyFile.Actions...) + pathFound = true + break + } + } + if !pathFound { + // Hack for https://github.com/openshift/machine-config-operator/pull/4160#discussion_r1548669673 here + // Changes to files in /etc/containers/registries.d should cause a CRIO reload + // This should be removed once NodeDisruptionPolicy can support directories and wildcards + if filepath.Dir(diffPath) == constants.SigstoreRegistriesConfigDir { + klog.Infof("Exception Action: diffPath %s is a subdir of %s, adding a CRIO reload", diffPath, constants.SigstoreRegistriesConfigDir) + actions = append(actions, opv1.NodeDisruptionPolicyStatusAction{Type: opv1.ReloadStatusAction, Reload: &opv1.ReloadService{ + ServiceName: constants.CRIOServiceName, + }}) + continue + } + // If this file path has no policy defined, default to reboot + klog.V(4).Infof("no policy found for diff path %s", diffPath) + return []opv1.NodeDisruptionPolicyStatusAction{{ + Type: opv1.RebootStatusAction, + }} + } + } + + // Step through all unit based policies, and build out the actions object + for _, diffUnit := range diffUnitSet { + unitFound := false + for _, policyUnit := range clusterPolicies.Units { + klog.V(4).Infof("comparing policy unit name %s to diff unit name %s", string(policyUnit.Name), diffUnit) + if string(policyUnit.Name) == diffUnit { + klog.Infof("NodeDisruptionPolicy found for diff unit %s!", diffUnit) + actions = append(actions, policyUnit.Actions...) + unitFound = true + break + } + } + if !unitFound { + // If this unit has no policy defined, default to reboot + klog.V(4).Infof("no policy found for diff unit %s", diffUnit) + return []opv1.NodeDisruptionPolicyStatusAction{{ + Type: opv1.RebootStatusAction, + }} + } + } + + // SSH only has one possible policy(and there is a default), so blindly add that if there is an SSH diff + if diffSSH { + klog.Infof("SSH diff detected, applying SSH policy") + actions = append(actions, clusterPolicies.SSHKey.Actions...) + } + + // If any of the actions need a reboot, then just return a single Reboot action + if apihelpers.CheckNodeDisruptionActionsForTargetActions(actions, opv1.RebootStatusAction) { + return []opv1.NodeDisruptionPolicyStatusAction{{ + Type: opv1.RebootStatusAction, + }} + } + + // If there is a "None" action in conjunction with other kinds of actions, strip out the "None" action elements as it is redundant + if apihelpers.CheckNodeDisruptionActionsForTargetActions(actions, opv1.NoneStatusAction) { + if apihelpers.CheckNodeDisruptionActionsForTargetActions(actions, opv1.DrainStatusAction, opv1.ReloadStatusAction, opv1.RestartStatusAction, opv1.DaemonReloadStatusAction, opv1.SpecialStatusAction) { + finalActions := []opv1.NodeDisruptionPolicyStatusAction{} + for _, action := range actions { + if action.Type != opv1.NoneStatusAction { + finalActions = append(finalActions, action) + } + } + return finalActions + } + // If we're here, this means that the action list has only "None" actions; return a single "None" Action + return []opv1.NodeDisruptionPolicyStatusAction{{ + Type: opv1.NoneStatusAction, + }} + } + + // If we're here, return as is - this means action list had zero "None" actions in the list + return actions +} + func calculatePostConfigChangeAction(diff *machineConfigDiff, diffFileSet []string) ([]string, error) { // If a machine-config-daemon-force file is present, it means the user wants to // move to desired state without additional validation. We will reboot the node in @@ -491,8 +705,86 @@ func calculatePostConfigChangeAction(diff *machineConfigDiff, diffFileSet []stri return []string{postConfigChangeActionReboot}, nil } - // We don't actually have to consider ssh keys changes, which is the only section of passwd that is allowed to change - return calculatePostConfigChangeActionFromFileDiffs(diffFileSet), nil + // Calculate actions based on file, unit and ssh diffs + return calculatePostConfigChangeActionFromMCDiffs(diffFileSet), nil +} + +// calculatePostConfigChangeNodeDisruptionAction takes action based on the cluster's Node disruption policies. +func (dn *Daemon) calculatePostConfigChangeNodeDisruptionAction(diff *machineConfigDiff, diffFileSet, diffUnitSet []string) ([]opv1.NodeDisruptionPolicyStatusAction, error) { + + var mcop *opv1.MachineConfiguration + var err error + // Wait for mcop.Status.NodeDisruptionPolicyStatus to populate, otherwise error out. This shouldn't take very long + // as this is done by the operator sync loop, but may be extended if transitioning to TechPreview as the operator restarts, + if err = wait.PollUntilContextTimeout(context.TODO(), 5*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) { + mcop, err = dn.mcopClient.OperatorV1().MachineConfigurations().Get(context.TODO(), ctrlcommon.MCOOperatorKnobsObjectName, metav1.GetOptions{}) + if err != nil { + klog.Errorf("calculating NodeDisruptionPolicies: MachineConfiguration/cluster has not been created yet") + err = fmt.Errorf("MachineConfiguration/cluster has not been created yet") + return false, nil + } + // There will always be atleast five file policies, if they don't exist, then the Status hasn't been populated yet + // + // TODO: When Conditions on this object are implemented; this check could be updated to only proceed when + // status.ObservedGeneration matches the last generation of MachineConfiguration + if len(mcop.Status.NodeDisruptionPolicyStatus.ClusterPolicies.Files) == 0 { + klog.Errorf("calculating NodeDisruptionPolicies: NodeDisruptionPolicyStatus has not been populated yet") + err = fmt.Errorf("NodeDisruptionPolicyStatus has not been populated yet") + return false, nil + } + return true, nil + }); err != nil { + klog.Errorf("NodeDisruptionPolicyStatus was not ready: %v", err) + err = fmt.Errorf("NodeDisruptionPolicyStatus was not ready: %v", err) + return nil, err + } + + // Continue policy calculation if no errors were encountered in fetching the policy. + // If a machine-config-daemon-force file is present, it means the user wants to + // move to desired state without additional validation. We will reboot the node in + // this case regardless of what MachineConfig diff is. + klog.Infof("Calculating node disruption actions") + if _, err = os.Stat(constants.MachineConfigDaemonForceFile); err == nil { + if err = os.Remove(constants.MachineConfigDaemonForceFile); err != nil { + return []opv1.NodeDisruptionPolicyStatusAction{}, fmt.Errorf("failed to remove force validation file: %w", err) + } + klog.Infof("Setting post config change node disruption action to Reboot; %s present", constants.MachineConfigDaemonForceFile) + return []opv1.NodeDisruptionPolicyStatusAction{{ + Type: opv1.RebootStatusAction, + }}, nil + } + + if diff.osUpdate || diff.kargs || diff.fips || diff.kernelType || diff.extensions { + // must reboot + return []opv1.NodeDisruptionPolicyStatusAction{{ + Type: opv1.RebootStatusAction, + }}, nil + } + if !diff.files && !diff.units && !diff.passwd { + // This is a diff which requires no actions + klog.Infof("No changes in files, units or SSH keys, no NodeDisruptionPolicies are in effect") + return []opv1.NodeDisruptionPolicyStatusAction{{ + Type: opv1.NoneStatusAction, + }}, nil + } + + // Calculate actions based on file, unit and ssh diffs + nodeDisruptionActions := calculatePostConfigChangeNodeDisruptionActionFromMCDiffs(diff.passwd, diffFileSet, diffUnitSet, mcop.Status.NodeDisruptionPolicyStatus.ClusterPolicies) + + // Print out node disruption actions for debug purposes + klog.Infof("Calculated node disruption actions:") + for _, action := range nodeDisruptionActions { + if action.Type == opv1.ReloadStatusAction { + klog.Infof("%v - %v", action.Type, action.Reload.ServiceName) + } else if action.Type == opv1.RestartStatusAction { + klog.Infof("%v - %v", action.Type, action.Restart.ServiceName) + } else { + klog.Infof("%v", action.Type) + } + } + + return nodeDisruptionActions, nil + } // This is another update function implementation for the special case of @@ -732,7 +1024,35 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig, skipCertifi logSystem("Starting update from %s to %s: %+v", oldConfigName, newConfigName, diff) diffFileSet := ctrlcommon.CalculateConfigFileDiffs(&oldIgnConfig, &newIgnConfig) - actions, err := calculatePostConfigChangeAction(diff, diffFileSet) + diffUnitSet := ctrlcommon.CalculateConfigUnitDiffs(&oldIgnConfig, &newIgnConfig) + + var fg featuregates.FeatureGate + + // This check is needed as featureGatesAccessor is not present during first boot. During firstboot + // the daemon will always do a reboot and NodeDisruptionPolicies are not active. + if dn.featureGatesAccessor != nil { + fg, err = dn.featureGatesAccessor.CurrentFeatureGates() + if err != nil { + klog.Errorf("Could not get fg: %v", err) + return err + } + } + + var nodeDisruptionActions []opv1.NodeDisruptionPolicyStatusAction + var nodeDisruptionError error + var actions []string + // If FeatureGateNodeDisruptionPolicy is set, calculate NodeDisruptionPolicy based actions for this MC diff + if fg != nil && fg.Enabled(v1.FeatureGateNodeDisruptionPolicy) { + nodeDisruptionActions, nodeDisruptionError = dn.calculatePostConfigChangeNodeDisruptionAction(diff, diffFileSet, diffUnitSet) + if nodeDisruptionError != nil { + // TODO: Fallback to legacy path and signal failure here + klog.Errorf("could not calculate node disruption actions: %v", nodeDisruptionError) + actions, err = calculatePostConfigChangeAction(diff, diffFileSet) + } + } else { + actions, err = calculatePostConfigChangeAction(diff, diffFileSet) + } + if err != nil { Nerr := upgrademonitor.GenerateAndApplyMachineConfigNodes( &upgrademonitor.Condition{State: mcfgalphav1.MachineConfigNodeUpdatePrepared, Reason: string(mcfgalphav1.MachineConfigNodeUpdateCompatible), Message: "Update Failed during the Checking for Compatibility phase."}, @@ -749,15 +1069,24 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig, skipCertifi return err } - // Check and perform node drain if required - crioOverrideConfigmapExists, err := dn.hasImageRegistryDrainOverrideConfigMap() - if err != nil { - return err - } - - drain, err := isDrainRequired(actions, diffFileSet, oldIgnConfig, newIgnConfig, crioOverrideConfigmapExists) - if err != nil { - return err + var drain bool + if fg != nil && fg.Enabled(v1.FeatureGateNodeDisruptionPolicy) && nodeDisruptionError == nil { + // Check actions list and perform node drain if required + drain, err = isDrainRequiredForNodeDisruptionActions(nodeDisruptionActions, oldIgnConfig, newIgnConfig) + if err != nil { + return err + } + klog.Infof("Drain calculated for node disruption: %v", drain) + } else { + // Check and perform node drain if required + crioOverrideConfigmapExists, err := dn.hasImageRegistryDrainOverrideConfigMap() + if err != nil { + return err + } + drain, err = isDrainRequired(actions, diffFileSet, oldIgnConfig, newIgnConfig, crioOverrideConfigmapExists) + if err != nil { + return err + } } err = upgrademonitor.GenerateAndApplyMachineConfigNodes( &upgrademonitor.Condition{State: mcfgalphav1.MachineConfigNodeUpdatePrepared, Reason: string(mcfgalphav1.MachineConfigNodeUpdateCompatible), Message: "Update is Compatible."}, @@ -943,11 +1272,11 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig, skipCertifi klog.Errorf("Error making MCN for Updated Files and OS: %v", err) } - err = dn.performPostConfigChangeAction(actions, newConfig.GetName()) - if err != nil { - return err + if fg != nil && fg.Enabled(v1.FeatureGateNodeDisruptionPolicy) && nodeDisruptionError == nil { + return dn.performPostConfigChangeNodeDisruptionAction(nodeDisruptionActions, newConfig.GetName()) } - return nil + // If we're here, FeatureGateNodeDisruptionPolicy is off/errored, so perform legacy action + return dn.performPostConfigChangeAction(actions, newConfig.GetName()) } // This is currently a subsection copied over from update() since we need to be more nuanced. Should eventually diff --git a/pkg/daemon/upgrade_monitor_test.go b/pkg/daemon/upgrade_monitor_test.go index a9f72c69fa..636ab381e7 100644 --- a/pkg/daemon/upgrade_monitor_test.go +++ b/pkg/daemon/upgrade_monitor_test.go @@ -15,6 +15,8 @@ import ( "k8s.io/apimachinery/pkg/runtime" kubeinformers "k8s.io/client-go/informers" k8sfake "k8s.io/client-go/kubernetes/fake" + + mcopfake "github.com/openshift/client-go/operator/clientset/versioned/fake" ) type upgradeMonitorTestCase struct { @@ -103,6 +105,8 @@ func (tc upgradeMonitorTestCase) run(t *testing.T) { f.kubeobjects = []runtime.Object{} f.client = fake.NewSimpleClientset(f.objects...) f.kubeclient = k8sfake.NewSimpleClientset(f.kubeobjects...) + + f.oclient = mcopfake.NewSimpleClientset(f.objects...) fgAccess := featuregates.NewHardcodedFeatureGateAccess( []apicfgv1.FeatureGateName{ apicfgv1.FeatureGateMachineConfigNodes, @@ -128,6 +132,7 @@ func (tc upgradeMonitorTestCase) run(t *testing.T) { i.Machineconfiguration().V1().MachineConfigs(), k8sI.Core().V1().Nodes(), i.Machineconfiguration().V1().ControllerConfigs(), + f.oclient, false, "", fgAccess, diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go index 8a0b2928d0..78483e8089 100644 --- a/pkg/operator/operator.go +++ b/pkg/operator/operator.go @@ -436,13 +436,13 @@ func (optr *Operator) sync(key string) error { // "RenderConfig" must always run first as it sets the renderConfig in the operator // for the sync funcs below {"RenderConfig", optr.syncRenderConfig}, + {"MachineConfiguration", optr.syncMachineConfiguration}, {"MachineConfigNode", optr.syncMachineConfigNodes}, {"MachineConfigPools", optr.syncMachineConfigPools}, {"MachineConfigDaemon", optr.syncMachineConfigDaemon}, {"MachineConfigController", optr.syncMachineConfigController}, {"MachineConfigServer", optr.syncMachineConfigServer}, {"MachineOSBuilder", optr.syncMachineOSBuilder}, - {"MachineConfiguration", optr.syncMachineConfiguration}, // this check must always run last since it makes sure the pools are in sync/upgrading correctly {"RequiredPools", optr.syncRequiredMachineConfigPools}, } diff --git a/pkg/operator/sync.go b/pkg/operator/sync.go index 3ce9b30963..307e418390 100644 --- a/pkg/operator/sync.go +++ b/pkg/operator/sync.go @@ -11,6 +11,7 @@ import ( "net" "net/url" "os" + "reflect" "strconv" "strings" "time" @@ -35,6 +36,7 @@ import ( configv1 "github.com/openshift/api/config/v1" mcfgv1 "github.com/openshift/api/machineconfiguration/v1" v1alpha1 "github.com/openshift/api/machineconfiguration/v1alpha1" + opv1 "github.com/openshift/api/operator/v1" mcoac "github.com/openshift/client-go/operator/applyconfigurations/operator/v1" "github.com/openshift/library-go/pkg/operator/resource/resourceapply" @@ -1413,7 +1415,7 @@ func (optr *Operator) syncMachineConfigServer(config *renderConfig) error { // syncRequiredMachineConfigPools ensures that all the nodes in machineconfigpools labeled with requiredForUpgradeMachineConfigPoolLabelKey // have updated to the latest configuration. -func (optr *Operator) syncRequiredMachineConfigPools(_ *renderConfig) error { +func (optr *Operator) syncRequiredMachineConfigPools(config *renderConfig) error { var lastErr error ctx := context.TODO() @@ -1437,6 +1439,12 @@ func (optr *Operator) syncRequiredMachineConfigPools(_ *renderConfig) error { if err := optr.syncMetrics(); err != nil { return false, err } + // This was needed in-case the cluster is mid-update when a new MachineConfiguration was applied. + // This prevents the need to wait for all the master nodes to update before the MachineConfiguration + // status is updated. + if err := optr.syncMachineConfiguration(config); err != nil { + return false, err + } if lastErr != nil { co, err := optr.fetchClusterOperator() if err != nil { @@ -1960,11 +1968,13 @@ func cmToData(cm *corev1.ConfigMap, key string) ([]byte, error) { func (optr *Operator) syncMachineConfiguration(_ *renderConfig) error { // Grab the cluster CR - _, err := optr.mcopLister.Get(ctrlcommon.MCOOperatorKnobsObjectName) + mcop, err := optr.mcopLister.Get(ctrlcommon.MCOOperatorKnobsObjectName) if err != nil { // Create one if it doesn't exist if apierrors.IsNotFound(err) { klog.Info("MachineConfiguration object doesn't exist; a new one will be created") + // Using server-side apply here as the NodeDisruption API has a rule technicality which prevents apply using a template manifest like the MCO typically does + // [spec.nodeDisruptionPolicy.sshkey.actions: Required value, : Invalid value: "null"] p := mcoac.MachineConfiguration(ctrlcommon.MCOOperatorKnobsObjectName).WithSpec(mcoac.MachineConfigurationSpec().WithManagementState("Managed")) _, err := optr.mcopClient.OperatorV1().MachineConfigurations().Apply(context.TODO(), p, metav1.ApplyOptions{FieldManager: "machine-config-operator"}) if err != nil { @@ -1988,7 +1998,21 @@ func (optr *Operator) syncMachineConfiguration(_ *renderConfig) error { return nil } - // Do additional processing for feature gate related fields here + // Merges the cluster's default node disruption policies with the user defined policies, if any. + newNodeDisruptionPolicyStatus := opv1.NodeDisruptionPolicyStatus{ + ClusterPolicies: apihelpers.MergeClusterPolicies(mcop.Spec.NodeDisruptionPolicy), + } + + // Check if any changes are required in the Status before making the API call. + if !reflect.DeepEqual(mcop.Status.NodeDisruptionPolicyStatus, newNodeDisruptionPolicyStatus) { + klog.Infof("Updating NodeDisruptionPolicy status") + mcop.Status.NodeDisruptionPolicyStatus = newNodeDisruptionPolicyStatus + _, err = optr.mcopClient.OperatorV1().MachineConfigurations().UpdateStatus(context.TODO(), mcop, metav1.UpdateOptions{}) + if err != nil { + klog.Errorf("NodeDisruptionPolicy status apply failed: %v", err) + return nil + } + } return nil }