Skip to content
66 changes: 40 additions & 26 deletions pkg/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -1015,7 +1015,10 @@ func (dn *Daemon) checkStateOnFirstRun() error {
if err := dn.drain(); err != nil {
return err
}
return dn.finalizeAndReboot(state.pendingConfig)
if err := dn.finalizeBeforeReboot(state.pendingConfig); err != nil {
return err
}
return dn.reboot(fmt.Sprintf("Node will reboot into config %v", state.pendingConfig.GetName()))
}

if err := dn.detectEarlySSHAccessesFromBoot(); err != nil {
Expand Down Expand Up @@ -1043,7 +1046,10 @@ func (dn *Daemon) checkStateOnFirstRun() error {
if err := os.RemoveAll(osImageContentDir); err != nil {
return err
}
return dn.finalizeAndReboot(state.currentConfig)
if err := dn.finalizeBeforeReboot(state.currentConfig); err != nil {
return err
}
return dn.reboot(fmt.Sprintf("Node will reboot into config %v", state.currentConfig.GetName()))
}
glog.Info("No bootstrap pivot required; unlinking bootstrap node annotations")

Expand Down Expand Up @@ -1097,39 +1103,53 @@ func (dn *Daemon) checkStateOnFirstRun() error {
glog.Info("Validated on-disk state")
} else {
glog.Infof("Skipping on-disk validation; %s present", constants.MachineConfigDaemonForceFile)
if err := os.Remove(constants.MachineConfigDaemonForceFile); err != nil {
return errors.Wrap(err, "failed to remove force validation file")
}
return dn.triggerUpdateWithMachineConfig(state.currentConfig, state.desiredConfig)
}

// We've validated our state. In the case where we had a pendingConfig,
// make that now currentConfig. We update the node annotation, delete the
// state file, etc.
//
// However, it may be the case that desiredConfig changed while we
// were coming up, so we next look at that before uncordoning the node (so
// we don't uncordon and then immediately re-cordon)
// We've validated state. Now, ensure that node is in desired state
var inDesiredConfig bool
if inDesiredConfig, err = dn.updateConfigAndState(state); err != nil {
return err
}
if inDesiredConfig {
return nil
}

if dn.recorder != nil {
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "BootResync", fmt.Sprintf("Booting node %s, currentConfig %s, desiredConfig %s", dn.node.Name, state.currentConfig.GetName(), state.desiredConfig.GetName()))
}
// currentConfig != desiredConfig, and we're not booting up into the desiredConfig.
// Kick off an update.
return dn.triggerUpdateWithMachineConfig(state.currentConfig, state.desiredConfig)
}

// updateConfigAndState updates node to desired state, labels nodes as done and uncordon
func (dn *Daemon) updateConfigAndState(state *stateAndConfigs) (bool, error) {
// In the case where we had a pendingConfig, make that now currentConfig.
// We update the node annotation, delete the state file, etc.
if state.pendingConfig != nil {
if dn.recorder != nil {
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "NodeDone", fmt.Sprintf("Setting node %s, currentConfig %s to Done", dn.node.Name, state.pendingConfig.GetName()))
}
if err := dn.nodeWriter.SetDone(dn.kubeClient.CoreV1().Nodes(), dn.nodeLister, dn.name, state.pendingConfig.GetName()); err != nil {
return errors.Wrap(err, "error setting node's state to Done")
return true, errors.Wrap(err, "error setting node's state to Done")
}
if out, err := dn.storePendingState(state.pendingConfig, 0); err != nil {
return errors.Wrapf(err, "failed to reset pending config: %s", string(out))
return true, errors.Wrapf(err, "failed to reset pending config: %s", string(out))
}

state.currentConfig = state.pendingConfig
}

if state.bootstrapping {
if err := dn.storeCurrentConfigOnDisk(state.currentConfig); err != nil {
return err
return false, err
}
}

// In case of node reboot, it may be the case that desiredConfig changed while we
// were coming up, so we next look at that before uncordoning the node (so
// we don't uncordon and then immediately re-cordon)
inDesiredConfig := state.currentConfig.GetName() == state.desiredConfig.GetName()
if inDesiredConfig {
if state.pendingConfig != nil {
Expand All @@ -1138,7 +1158,7 @@ func (dn *Daemon) checkStateOnFirstRun() error {
glog.Infof("Completing pending config %s", state.pendingConfig.GetName())
if err := dn.completeUpdate(dn.node, state.pendingConfig.GetName()); err != nil {
MCDUpdateState.WithLabelValues("", err.Error()).SetToCurrentTime()
return err
return inDesiredConfig, err
}
}
// If we're degraded here, it means we got an error likely on startup and we retried.
Expand All @@ -1147,22 +1167,16 @@ func (dn *Daemon) checkStateOnFirstRun() error {
if err := dn.nodeWriter.SetDone(dn.kubeClient.CoreV1().Nodes(), dn.nodeLister, dn.name, state.currentConfig.GetName()); err != nil {
errLabelStr := fmt.Sprintf("error setting node's state to Done: %v", err)
MCDUpdateState.WithLabelValues("", errLabelStr).SetToCurrentTime()
return errors.Wrap(err, "error setting node's state to Done")
return inDesiredConfig, errors.Wrap(err, "error setting node's state to Done")
}
}

glog.Infof("In desired config %s", state.currentConfig.GetName())
MCDUpdateState.WithLabelValues(state.currentConfig.GetName(), "").SetToCurrentTime()

// All good!
return nil
}
if dn.recorder != nil {
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "BootResync", fmt.Sprintf("Booting node %s, currentConfig %s, desiredConfig %s", dn.node.Name, state.currentConfig.GetName(), state.desiredConfig.GetName()))
}
// currentConfig != desiredConfig, and we're not booting up into the desiredConfig.
// Kick off an update.
return dn.triggerUpdateWithMachineConfig(state.currentConfig, state.desiredConfig)

// No errors have occurred. Returns true if currentConfig == desiredConfig, false otherwise (needs update)
return inDesiredConfig, nil
}

// runOnceFromMachineConfig utilizes a parsed machineConfig and executes in onceFrom
Expand Down
186 changes: 180 additions & 6 deletions pkg/daemon/update.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,16 @@ const (
fipsFile = "/proc/sys/crypto/fips_enabled"
extensionsRepo = "/etc/yum.repos.d/coreos-extensions.repo"
osImageContentBaseDir = "/run/mco-machine-os-content/"

// These are the actions for a node to take after applying config changes. (e.g. a new machineconfig is applied)
// "None" means no special action needs to be taken. A drain will still happen.
// This currently happens when ssh keys or pull secret (/var/lib/kubelet/config.json) is changed
postConfigChangeActionNone = "none"
// Rebooting is still the default scenario for any other change
postConfigChangeActionReboot = "reboot"
// Crio reload will happen when /etc/containers/registries.conf is changed. This will cause
// a "systemctl reload crio"
postConfigChangeActionReloadCrio = "reload crio"
)

func writeFileAtomicallyWithDefaults(fpath string, b []byte) error {
Expand Down Expand Up @@ -93,9 +103,70 @@ func getNodeRef(node *corev1.Node) *corev1.ObjectReference {
}
}

// finalizeAndReboot is the last step in an update(), and it can also
// be called as a special case for the "bootstrap pivot".
func (dn *Daemon) finalizeAndReboot(newConfig *mcfgv1.MachineConfig) (retErr error) {
func reloadService(name string) error {
_, err := runGetOut("systemctl", "reload", name)
return err
}

// performPostConfigChangeAction takes action based on what postConfigChangeAction has been asked.
// For non-reboot action, it applies configuration, updates node's config and state.
// In the end uncordon node to schedule workload.
// If at any point an error occurs, we reboot the node so that node has correct configuration.
func (dn *Daemon) performPostConfigChangeAction(postConfigChangeActions []string, configName string) error {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wondering if there should be an event here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, as in, add events for non-reboot scenarios? (We do have reboot events I believe)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes exactly wondering if they are covered sufficiently (sorry for not being clear!)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(i can take or leave this was just wondering)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added some in the specific scenarios below:
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "Reboot", "Config changes do not require reboot.")
and
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "Reboot", "Config changes do not require reboot. Service %s was reloaded.", serviceName)

Should I also add some warnings for the failure cases? Or what would be a good way to handle those (they would get a reboot event regardless)

Copy link
Contributor

@kikisdeliveryservice kikisdeliveryservice Dec 2, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we have events for failed drain and fails to reconcile for ex:
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeWarning, "FailedToDrain", failMsg)

maybe a similar one near?: dn.logSystem("Reloading %s configuration failed, node will reboot instead. Error: %v", serviceName, err)

Up to you tho, I don't feel that strongly

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you mean this line: dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeWarning, "FailedToDrain", failMsg) ?
So basically for what I added, instead of Normal I added it as a warning instead?

Copy link
Contributor

@kikisdeliveryservice kikisdeliveryservice Dec 2, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh duh yes sorry for the bad copypasta, yes that is the example, and yeah i thought maaybe put a warning near the reloading failed logging you already have to show it failed (below on line 129)

if ctrlcommon.InSlice(postConfigChangeActionReboot, postConfigChangeActions) {
dn.logSystem("Rebooting node")
return dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName))
}

if ctrlcommon.InSlice(postConfigChangeActionNone, postConfigChangeActions) {
if dn.recorder != nil {
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "SkipReboot", "Config changes do not require reboot.")
}
dn.logSystem("Node has Desired Config %s, skipping reboot", configName)
}

if ctrlcommon.InSlice(postConfigChangeActionReloadCrio, postConfigChangeActions) {
serviceName := "crio"

if err := reloadService(serviceName); err != nil {
if dn.recorder != nil {
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeWarning, "FailedServiceReload", fmt.Sprintf("Reloading %s service failed. Error: %v", serviceName, err))
}
dn.logSystem("Reloading %s configuration failed, node will reboot instead. Error: %v", serviceName, err)
dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName))
}

if dn.recorder != nil {
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "SkipReboot", "Config changes do not require reboot. Service %s was reloaded.", serviceName)
}
dn.logSystem("%s config reloaded successfully! Desired config %s has been applied, skipping reboot", serviceName, configName)
}

// We are here, which means reboot was not needed to apply the configuration.

// Get current state of node, in case of an error reboot
state, err := dn.getStateAndConfigs(configName)
if err != nil {
glog.Errorf("Error processing state and configs, node will reboot instead. Error: %v", err)
return dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName))
}

var inDesiredConfig bool
if inDesiredConfig, err = dn.updateConfigAndState(state); err != nil {
glog.Errorf("Setting node's state to Done failed, node will reboot instead. Error: %v", err)
return dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName))
}
if inDesiredConfig {
return nil
}

// currentConfig != desiredConfig, kick off an update
return dn.triggerUpdateWithMachineConfig(state.currentConfig, state.desiredConfig)
}

// finalizeBeforeReboot is the last step in an update() and then we take appropriate postConfigChangeAction.
// It can also be called as a special case for the "bootstrap pivot".
func (dn *Daemon) finalizeBeforeReboot(newConfig *mcfgv1.MachineConfig) (retErr error) {
if out, err := dn.storePendingState(newConfig, 1); err != nil {
return errors.Wrapf(err, "failed to log pending config: %s", string(out))
}
Expand All @@ -114,8 +185,7 @@ func (dn *Daemon) finalizeAndReboot(newConfig *mcfgv1.MachineConfig) (retErr err
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "PendingConfig", fmt.Sprintf("Written pending config %s", newConfig.GetName()))
}

// reboot. this function shouldn't actually return.
return dn.reboot(fmt.Sprintf("Node will reboot into config %v", newConfig.GetName()))
return nil
}

func (dn *Daemon) drain() error {
Expand Down Expand Up @@ -394,6 +464,101 @@ func (dn *Daemon) applyOSChanges(oldConfig, newConfig *mcfgv1.MachineConfig) (re

}

func calculatePostConfigChangeActionFromFileDiffs(oldIgnConfig, newIgnConfig ign3types.Config) (actions []string) {
filesPostConfigChangeActionNone := []string{
"/var/lib/kubelet/config.json",
}
filesPostConfigChangeActionReloadCrio := []string{
"/etc/containers/registries.conf",
}

oldFileSet := make(map[string]ign3types.File)
for _, f := range oldIgnConfig.Storage.Files {
oldFileSet[f.Path] = f
}
newFileSet := make(map[string]ign3types.File)
for _, f := range newIgnConfig.Storage.Files {
newFileSet[f.Path] = f
}
diffFileSet := []string{}

// First check if any files were removed
for path := range oldFileSet {
_, ok := newFileSet[path]
if !ok {
// debug: remove
glog.Infof("File diff: %v was deleted", path)
diffFileSet = append(diffFileSet, path)
}
}

// Now check if any files were added/changed
for path, newFile := range newFileSet {
oldFile, ok := oldFileSet[path]
if !ok {
// debug: remove
glog.Infof("File diff: %v was added", path)
diffFileSet = append(diffFileSet, path)
} else if !reflect.DeepEqual(oldFile, newFile) {
// debug: remove
glog.Infof("File diff: detected change to %v", newFile.Path)
diffFileSet = append(diffFileSet, path)
}
}

// Now calculate action
for _, k := range diffFileSet {
if ctrlcommon.InSlice(k, filesPostConfigChangeActionNone) {
continue
} else if ctrlcommon.InSlice(k, filesPostConfigChangeActionReloadCrio) {
actions = []string{postConfigChangeActionReloadCrio}
continue
} else {
actions = []string{postConfigChangeActionReboot}
break
}
}

if len(actions) == 0 {
actions = []string{postConfigChangeActionNone}
}
return
}

func calculatePostConfigChangeAction(oldConfig, newConfig *mcfgv1.MachineConfig) ([]string, error) {
// If a machine-config-daemon-force file is present, it means the user wants to
// move to desired state without additional validation. We will reboot the node in
// this case regardless of what MachineConfig diff is.
if _, err := os.Stat(constants.MachineConfigDaemonForceFile); err == nil {
if err := os.Remove(constants.MachineConfigDaemonForceFile); err != nil {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this does change the behaviour of how the forcefile works slightly, but I guess its probably for the better, since this way it stays on the system until an update happens. Off the top of my head I don't think this should cause a problem. So +1

return []string{}, errors.Wrap(err, "failed to remove force validation file")
}
glog.Infof("Setting post config change action to postConfigChangeActionReboot; %s present", constants.MachineConfigDaemonForceFile)
return []string{postConfigChangeActionReboot}, nil
}

diff, err := newMachineConfigDiff(oldConfig, newConfig)
if err != nil {
return []string{}, err
}
if diff.osUpdate || diff.kargs || diff.fips || diff.units || diff.kernelType || diff.extensions {
// must reboot
return []string{postConfigChangeActionReboot}, nil
}

oldIgnConfig, err := ctrlcommon.ParseAndConvertConfig(oldConfig.Spec.Config.Raw)
if err != nil {
return []string{}, err
}
newIgnConfig, err := ctrlcommon.ParseAndConvertConfig(newConfig.Spec.Config.Raw)
if err != nil {
return []string{}, err
}

// We don't actually have to consider ssh keys changes, which is the only section of passwd that is allowed to change
return calculatePostConfigChangeActionFromFileDiffs(oldIgnConfig, newIgnConfig), nil
}

// update the node to the provided node configuration.
func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig) (retErr error) {
oldConfig = canonicalizeEmptyMC(oldConfig)
Expand Down Expand Up @@ -440,6 +605,11 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig) (retErr err

dn.logSystem("Starting update from %s to %s: %+v", oldConfigName, newConfigName, diff)

actions, err := calculatePostConfigChangeAction(oldConfig, newConfig)
if err != nil {
return err
}

if err := dn.drain(); err != nil {
return err
}
Expand Down Expand Up @@ -515,7 +685,11 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig) (retErr err
glog.Info("Updated kernel tuning arguments")
}

return dn.finalizeAndReboot(newConfig)
if err := dn.finalizeBeforeReboot(newConfig); err != nil {
return err
}

return dn.performPostConfigChangeAction(actions, newConfig.GetName())
}

// removeRollback removes the rpm-ostree rollback deployment. It
Expand Down
Loading