From 84362bf4939063a01b315f3ee7909d2d37f9c1e4 Mon Sep 17 00:00:00 2001 From: Sinny Kumari Date: Tue, 24 Nov 2020 17:20:01 +0100 Subject: [PATCH 1/8] daemon: skip or perform node reboot based on rebootAction Today MCO defaults to rebooting node in order to apply successfully changes to the node. This is the safest way to apply a change on the node as we don't have to worry about things like which change is safe to skip reboot. In certain environments, rebooting is expensive and due to complex hardware setup sometimes can cause problems and node won't boot up. This will help to avoid rebooting nodes for certain cases where MCO thinks it is safe to do so. See - https://github.com/openshift/enhancements/pull/159 --- pkg/daemon/daemon.go | 71 ++++++++++++++++++++++++++++---------------- pkg/daemon/update.go | 70 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 109 insertions(+), 32 deletions(-) diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 42336a6e25..12b08ae2ec 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -165,6 +165,14 @@ const ( onceFromRemoteConfig ) +type rebootAction int + +const ( + rebootActionReboot rebootAction = iota + rebootActionNone + rebootActionReloadCrio +) + var ( defaultRebootTimeout = 24 * time.Hour ) @@ -1015,7 +1023,10 @@ func (dn *Daemon) checkStateOnFirstRun() error { if err := dn.drain(); err != nil { return err } - return dn.finalizeAndReboot(state.pendingConfig) + if err := dn.finalizeBeforeReboot(state.pendingConfig); err != nil { + return err + } + return dn.reboot(fmt.Sprintf("Node will reboot into config %v", state.pendingConfig.GetName())) } if err := dn.detectEarlySSHAccessesFromBoot(); err != nil { @@ -1043,7 +1054,10 @@ func (dn *Daemon) checkStateOnFirstRun() error { if err := os.RemoveAll(osImageContentDir); err != nil { return err } - return dn.finalizeAndReboot(state.currentConfig) + if err := dn.finalizeBeforeReboot(state.currentConfig); err != nil { + return err + } + return dn.reboot(fmt.Sprintf("Node will reboot into config %v", state.currentConfig.GetName())) } glog.Info("No bootstrap pivot required; unlinking bootstrap node annotations") @@ -1103,33 +1117,44 @@ func (dn *Daemon) checkStateOnFirstRun() error { return dn.triggerUpdateWithMachineConfig(state.currentConfig, state.desiredConfig) } - // We've validated our state. In the case where we had a pendingConfig, - // make that now currentConfig. We update the node annotation, delete the - // state file, etc. - // - // However, it may be the case that desiredConfig changed while we - // were coming up, so we next look at that before uncordoning the node (so - // we don't uncordon and then immediately re-cordon) + // We've validated state. Now, ensure that node is in desired state + var inDesiredConfig bool + if inDesiredConfig, err = dn.updateConfigAndState(state); err != nil { + return err + } + if inDesiredConfig { + return nil + } + + if dn.recorder != nil { + dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "BootResync", fmt.Sprintf("Booting node %s, currentConfig %s, desiredConfig %s", dn.node.Name, state.currentConfig.GetName(), state.desiredConfig.GetName())) + } + // currentConfig != desiredConfig, and we're not booting up into the desiredConfig. + // Kick off an update. + return dn.triggerUpdateWithMachineConfig(state.currentConfig, state.desiredConfig) +} + +// updateConfigAndState updates node to desired state, labels nodes as done and uncordon +func (dn *Daemon) updateConfigAndState(state *stateAndConfigs) (bool, error) { + // In the case where we had a pendingConfig, make that now currentConfig. + // We update the node annotation, delete the state file, etc. if state.pendingConfig != nil { if dn.recorder != nil { dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "NodeDone", fmt.Sprintf("Setting node %s, currentConfig %s to Done", dn.node.Name, state.pendingConfig.GetName())) } if err := dn.nodeWriter.SetDone(dn.kubeClient.CoreV1().Nodes(), dn.nodeLister, dn.name, state.pendingConfig.GetName()); err != nil { - return errors.Wrap(err, "error setting node's state to Done") + return true, errors.Wrap(err, "error setting node's state to Done") } if out, err := dn.storePendingState(state.pendingConfig, 0); err != nil { - return errors.Wrapf(err, "failed to reset pending config: %s", string(out)) + return true, errors.Wrapf(err, "failed to reset pending config: %s", string(out)) } state.currentConfig = state.pendingConfig } - if state.bootstrapping { - if err := dn.storeCurrentConfigOnDisk(state.currentConfig); err != nil { - return err - } - } - + // In case of node reboot, it may be the case that desiredConfig changed while we + // were coming up, so we next look at that before uncordoning the node (so + // we don't uncordon and then immediately re-cordon) inDesiredConfig := state.currentConfig.GetName() == state.desiredConfig.GetName() if inDesiredConfig { if state.pendingConfig != nil { @@ -1138,7 +1163,7 @@ func (dn *Daemon) checkStateOnFirstRun() error { glog.Infof("Completing pending config %s", state.pendingConfig.GetName()) if err := dn.completeUpdate(dn.node, state.pendingConfig.GetName()); err != nil { MCDUpdateState.WithLabelValues("", err.Error()).SetToCurrentTime() - return err + return inDesiredConfig, err } } // If we're degraded here, it means we got an error likely on startup and we retried. @@ -1147,7 +1172,7 @@ func (dn *Daemon) checkStateOnFirstRun() error { if err := dn.nodeWriter.SetDone(dn.kubeClient.CoreV1().Nodes(), dn.nodeLister, dn.name, state.currentConfig.GetName()); err != nil { errLabelStr := fmt.Sprintf("error setting node's state to Done: %v", err) MCDUpdateState.WithLabelValues("", errLabelStr).SetToCurrentTime() - return errors.Wrap(err, "error setting node's state to Done") + return inDesiredConfig, errors.Wrap(err, "error setting node's state to Done") } } @@ -1155,14 +1180,8 @@ func (dn *Daemon) checkStateOnFirstRun() error { MCDUpdateState.WithLabelValues(state.currentConfig.GetName(), "").SetToCurrentTime() // All good! - return nil - } - if dn.recorder != nil { - dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "BootResync", fmt.Sprintf("Booting node %s, currentConfig %s, desiredConfig %s", dn.node.Name, state.currentConfig.GetName(), state.desiredConfig.GetName())) } - // currentConfig != desiredConfig, and we're not booting up into the desiredConfig. - // Kick off an update. - return dn.triggerUpdateWithMachineConfig(state.currentConfig, state.desiredConfig) + return inDesiredConfig, nil } // runOnceFromMachineConfig utilizes a parsed machineConfig and executes in onceFrom diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go index 15ff2512fa..692e867002 100644 --- a/pkg/daemon/update.go +++ b/pkg/daemon/update.go @@ -93,9 +93,56 @@ func getNodeRef(node *corev1.Node) *corev1.ObjectReference { } } -// finalizeAndReboot is the last step in an update(), and it can also -// be called as a special case for the "bootstrap pivot". -func (dn *Daemon) finalizeAndReboot(newConfig *mcfgv1.MachineConfig) (retErr error) { +func reloadCrioConfig() error { + _, err := runGetOut("pkill", "-HUP", "crio") + return err +} + +// performRebootAction takes action based on what rebootAction has been asked. +// For non-reboot action, it applies configuration, updates node's config and state. +// In the end uncordon node to schedule workload. +// If at any point an error occurs, we reboot the node so that node has correct configuration. +func (dn *Daemon) performRebootAction(action rebootAction, configName string) error { + switch action { + case rebootActionNone: + dn.logSystem("Node has Desired Config %s, skipping reboot", configName) + case rebootActionReloadCrio: + if err := reloadCrioConfig(); err != nil { + dn.logSystem("Reloading crio configuration failed, rebooting: %v", err) + dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName)) + } + dn.logSystem("crio config reloaded successfully! Desired config %s has been applied, skipping reboot", configName) + default: + // Defaults to rebooting node + dn.logSystem("Rebooting node") + return dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName)) + } + + // We are here, which means reboot was not needed to apply the configuration. + + // Get current state of node, in case of an error reboot + state, err := dn.getStateAndConfigs(configName) + if err != nil { + glog.Errorf("Error processing state and configs, rebooting: %v", err) + return dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName)) + } + + var inDesiredConfig bool + if inDesiredConfig, err = dn.updateConfigAndState(state); err != nil { + glog.Errorf("Setting node's state to Done failed, rebooting: %v", err) + return dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName)) + } + if inDesiredConfig { + return nil + } + + // currentConfig != desiredConfig, kick off an update + return dn.triggerUpdateWithMachineConfig(state.currentConfig, state.desiredConfig) +} + +// finalizeBeforeReboot is the last step in an update() and then we take appropriate rebootAction. +// It can also be called as a special case for the "bootstrap pivot". +func (dn *Daemon) finalizeBeforeReboot(newConfig *mcfgv1.MachineConfig) (retErr error) { if out, err := dn.storePendingState(newConfig, 1); err != nil { return errors.Wrapf(err, "failed to log pending config: %s", string(out)) } @@ -114,8 +161,7 @@ func (dn *Daemon) finalizeAndReboot(newConfig *mcfgv1.MachineConfig) (retErr err dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "PendingConfig", fmt.Sprintf("Written pending config %s", newConfig.GetName())) } - // reboot. this function shouldn't actually return. - return dn.reboot(fmt.Sprintf("Node will reboot into config %v", newConfig.GetName())) + return nil } func (dn *Daemon) drain() error { @@ -515,7 +561,19 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig) (retErr err glog.Info("Updated kernel tuning arguments") } - return dn.finalizeAndReboot(newConfig) + if err := dn.finalizeBeforeReboot(newConfig); err != nil { + return err + } + + // TODO: Need Jerry's work to determine exact reboot action + var action rebootAction + action = dn.getRebootAction() + return dn.performRebootAction(action, newConfig.GetName()) +} + +func (dn *Daemon) getRebootAction() rebootAction { + // Until we have logic, always reboot + return rebootActionReboot } // removeRollback removes the rpm-ostree rollback deployment. It From 81476d409b7143dc12a4830b9d1de870b4c07d0c Mon Sep 17 00:00:00 2001 From: Yu Qi Zhang Date: Wed, 25 Nov 2020 23:18:43 -0500 Subject: [PATCH 2/8] daemon: calculate config change action based on MC diffs Add functionality to calculate the set of diffs between the existing and new rendered config when an update happens, and take action based on the diff. Currently supported actions are: reboot - default behaviour reload crio - when registries.conf is changed none - when pull secret or ssh key is changed Also rename rebootAction to postConfigChangeActionNone, although that name is up for debate. --- pkg/daemon/daemon.go | 8 --- pkg/daemon/update.go | 128 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 109 insertions(+), 27 deletions(-) diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 12b08ae2ec..4ae37c87a1 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -165,14 +165,6 @@ const ( onceFromRemoteConfig ) -type rebootAction int - -const ( - rebootActionReboot rebootAction = iota - rebootActionNone - rebootActionReloadCrio -) - var ( defaultRebootTimeout = 24 * time.Hour ) diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go index 692e867002..82cba647e8 100644 --- a/pkg/daemon/update.go +++ b/pkg/daemon/update.go @@ -48,6 +48,12 @@ const ( fipsFile = "/proc/sys/crypto/fips_enabled" extensionsRepo = "/etc/yum.repos.d/coreos-extensions.repo" osImageContentBaseDir = "/run/mco-machine-os-content/" + + // These are the actions for a node to take after applying config changes. + // Defaults to reboot. + postConfigChangeActionNone = "none" + postConfigChangeActionReboot = "reboot" + postConfigChangeActionReloadCrio = "reload crio" ) func writeFileAtomicallyWithDefaults(fpath string, b []byte) error { @@ -98,24 +104,24 @@ func reloadCrioConfig() error { return err } -// performRebootAction takes action based on what rebootAction has been asked. +// performPostConfigChangeAction takes action based on what postConfigChangeAction has been asked. // For non-reboot action, it applies configuration, updates node's config and state. // In the end uncordon node to schedule workload. // If at any point an error occurs, we reboot the node so that node has correct configuration. -func (dn *Daemon) performRebootAction(action rebootAction, configName string) error { - switch action { - case rebootActionNone: +func (dn *Daemon) performPostConfigChangeAction(postConfigChangeActions []string, configName string) error { + if ctrlcommon.InSlice(postConfigChangeActionReboot, postConfigChangeActions) { + dn.logSystem("Rebooting node") + return dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName)) + } + if ctrlcommon.InSlice(postConfigChangeActionNone, postConfigChangeActions) { dn.logSystem("Node has Desired Config %s, skipping reboot", configName) - case rebootActionReloadCrio: + } + if ctrlcommon.InSlice(postConfigChangeActionReloadCrio, postConfigChangeActions) { if err := reloadCrioConfig(); err != nil { dn.logSystem("Reloading crio configuration failed, rebooting: %v", err) dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName)) } dn.logSystem("crio config reloaded successfully! Desired config %s has been applied, skipping reboot", configName) - default: - // Defaults to rebooting node - dn.logSystem("Rebooting node") - return dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName)) } // We are here, which means reboot was not needed to apply the configuration. @@ -140,7 +146,7 @@ func (dn *Daemon) performRebootAction(action rebootAction, configName string) er return dn.triggerUpdateWithMachineConfig(state.currentConfig, state.desiredConfig) } -// finalizeBeforeReboot is the last step in an update() and then we take appropriate rebootAction. +// finalizeBeforeReboot is the last step in an update() and then we take appropriate postConfigChangeAction. // It can also be called as a special case for the "bootstrap pivot". func (dn *Daemon) finalizeBeforeReboot(newConfig *mcfgv1.MachineConfig) (retErr error) { if out, err := dn.storePendingState(newConfig, 1); err != nil { @@ -440,6 +446,91 @@ func (dn *Daemon) applyOSChanges(oldConfig, newConfig *mcfgv1.MachineConfig) (re } +func calculatePostConfigChangeActionFromFileDiffs(oldIgnConfig, newIgnConfig ign3types.Config) (actions []string) { + filesPostConfigChangeActionNone := []string{ + "/var/lib/kubelet/config.json", + } + filesPostConfigChangeActionReloadCrio := []string{ + "/etc/containers/registries.conf", + } + + oldFileSet := make(map[string]ign3types.File) + for _, f := range oldIgnConfig.Storage.Files { + oldFileSet[f.Path] = f + } + newFileSet := make(map[string]ign3types.File) + for _, f := range newIgnConfig.Storage.Files { + newFileSet[f.Path] = f + } + diffFileSet := []string{} + + // First check if any files were removed + for path := range oldFileSet { + _, ok := newFileSet[path] + if !ok { + // debug: remove + glog.Infof("File diff: %v was deleted", path) + diffFileSet = append(diffFileSet, path) + } + } + + // Now check if any files were added/changed + for path, newFile := range newFileSet { + oldFile, ok := oldFileSet[path] + if !ok { + // debug: remove + glog.Infof("File diff: %v was added", path) + diffFileSet = append(diffFileSet, path) + } + if !reflect.DeepEqual(oldFile, newFile) { + // debug: remove + glog.Infof("File diff: detected change to %v", newFile.Path) + diffFileSet = append(diffFileSet, path) + } + } + + // Now calculate action + for _, k := range diffFileSet { + if ctrlcommon.InSlice(k, filesPostConfigChangeActionNone) { + continue + } else if ctrlcommon.InSlice(k, filesPostConfigChangeActionReloadCrio) { + actions = []string{postConfigChangeActionReloadCrio} + continue + } else { + actions = []string{postConfigChangeActionReboot} + break + } + } + + if len(actions) == 0 { + actions = []string{postConfigChangeActionNone} + } + return +} + +func calculatePostConfigChangeAction(oldConfig, newConfig *mcfgv1.MachineConfig) ([]string, error) { + diff, err := newMachineConfigDiff(oldConfig, newConfig) + if err != nil { + return []string{}, err + } + if diff.osUpdate || diff.kargs || diff.fips || diff.units || diff.kernelType || diff.extensions { + // must reboot + return []string{postConfigChangeActionReboot}, nil + } + + oldIgnConfig, err := ctrlcommon.ParseAndConvertConfig(oldConfig.Spec.Config.Raw) + if err != nil { + return []string{}, err + } + newIgnConfig, err := ctrlcommon.ParseAndConvertConfig(newConfig.Spec.Config.Raw) + if err != nil { + return []string{}, err + } + + // We don't actually have to consider ssh keys changes, which is the only section of passwd that is allowed to change + return calculatePostConfigChangeActionFromFileDiffs(oldIgnConfig, newIgnConfig), nil +} + // update the node to the provided node configuration. func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig) (retErr error) { oldConfig = canonicalizeEmptyMC(oldConfig) @@ -486,6 +577,13 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig) (retErr err dn.logSystem("Starting update from %s to %s: %+v", oldConfigName, newConfigName, diff) + // TODO: consider how we should honor "force" flag here. Maybe if force, always reboot? + // TODO: consider if we should not cordon if no action needs to be taken + actions, err := calculatePostConfigChangeAction(oldConfig, newConfig) + if err != nil { + return err + } + if err := dn.drain(); err != nil { return err } @@ -565,15 +663,7 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig) (retErr err return err } - // TODO: Need Jerry's work to determine exact reboot action - var action rebootAction - action = dn.getRebootAction() - return dn.performRebootAction(action, newConfig.GetName()) -} - -func (dn *Daemon) getRebootAction() rebootAction { - // Until we have logic, always reboot - return rebootActionReboot + return dn.performPostConfigChangeAction(actions, newConfig.GetName()) } // removeRollback removes the rpm-ostree rollback deployment. It From 45a1030d2b06a5c7e68a25c8baed1b3576247a9b Mon Sep 17 00:00:00 2001 From: Sinny Kumari Date: Thu, 26 Nov 2020 15:48:37 +0100 Subject: [PATCH 3/8] daemon: store currentConfig on disk during bootstrap deleted accidently while splitting checkStateOnFirstRun() --- pkg/daemon/daemon.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 4ae37c87a1..912864f79d 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -1144,6 +1144,12 @@ func (dn *Daemon) updateConfigAndState(state *stateAndConfigs) (bool, error) { state.currentConfig = state.pendingConfig } + if state.bootstrapping { + if err := dn.storeCurrentConfigOnDisk(state.currentConfig); err != nil { + return false, err + } + } + // In case of node reboot, it may be the case that desiredConfig changed while we // were coming up, so we next look at that before uncordoning the node (so // we don't uncordon and then immediately re-cordon) From e443af7ca0c17500ae9e786bde66314abb2e9a46 Mon Sep 17 00:00:00 2001 From: Yu Qi Zhang Date: Thu, 26 Nov 2020 18:17:39 -0500 Subject: [PATCH 4/8] Add unit test coverage for selective reboot cases Add unit tests for different changes in the MachineConfig, to test if the calculated post config change action is expected. --- pkg/daemon/update.go | 3 +- pkg/daemon/update_test.go | 139 ++++++++++++++++++++++++++++++++++++++ test/helpers/helpers.go | 43 +++++++++++- 3 files changed, 182 insertions(+), 3 deletions(-) diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go index 82cba647e8..51654037d6 100644 --- a/pkg/daemon/update.go +++ b/pkg/daemon/update.go @@ -481,8 +481,7 @@ func calculatePostConfigChangeActionFromFileDiffs(oldIgnConfig, newIgnConfig ign // debug: remove glog.Infof("File diff: %v was added", path) diffFileSet = append(diffFileSet, path) - } - if !reflect.DeepEqual(oldFile, newFile) { + } else if !reflect.DeepEqual(oldFile, newFile) { // debug: remove glog.Infof("File diff: detected change to %v", newFile.Path) diffFileSet = append(diffFileSet, path) diff --git a/pkg/daemon/update_test.go b/pkg/daemon/update_test.go index 5d8aca03cd..0091499c06 100644 --- a/pkg/daemon/update_test.go +++ b/pkg/daemon/update_test.go @@ -12,6 +12,7 @@ import ( ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" "github.com/openshift/machine-config-operator/test/helpers" "github.com/stretchr/testify/assert" + "github.com/vincent-petithory/dataurl" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" k8sfake "k8s.io/client-go/kubernetes/fake" ) @@ -469,6 +470,144 @@ func TestDropinCheck(t *testing.T) { } } +// Test to see if the correct action is calculated given a machineconfig diff +// i.e. whether we need to reboot and what actions need to be taken if no reboot is needed +func TestCalculatePostConfigChangeAction(t *testing.T) { + files := map[string]ign3types.File{ + "pullsecret1": ign3types.File{ + Node: ign3types.Node{ + Path: "/var/lib/kubelet/config.json", + }, + FileEmbedded1: ign3types.FileEmbedded1{ + Contents: ign3types.Resource{ + Source: helpers.StrToPtr(dataurl.EncodeBytes([]byte("kubelet conf 1\n"))), + }, + }, + }, + "pullsecret2": ign3types.File{ + Node: ign3types.Node{ + Path: "/var/lib/kubelet/config.json", + }, + FileEmbedded1: ign3types.FileEmbedded1{ + Contents: ign3types.Resource{ + Source: helpers.StrToPtr(dataurl.EncodeBytes([]byte("kubelet conf 2\n"))), + }, + }, + }, + "registries1": ign3types.File{ + Node: ign3types.Node{ + Path: "/etc/containers/registries.conf", + }, + FileEmbedded1: ign3types.FileEmbedded1{ + Contents: ign3types.Resource{ + Source: helpers.StrToPtr(dataurl.EncodeBytes([]byte("registries content 1\n"))), + }, + }, + }, + "registries2": ign3types.File{ + Node: ign3types.Node{ + Path: "/etc/containers/registries.conf", + }, + FileEmbedded1: ign3types.FileEmbedded1{ + Contents: ign3types.Resource{ + Source: helpers.StrToPtr(dataurl.EncodeBytes([]byte("registries content 2\n"))), + }, + }, + }, + "randomfile1": ign3types.File{ + Node: ign3types.Node{ + Path: "/etc/random-reboot-file", + }, + FileEmbedded1: ign3types.FileEmbedded1{ + Contents: ign3types.Resource{ + Source: helpers.StrToPtr(dataurl.EncodeBytes([]byte("test\n"))), + }, + }, + }, + "randomfile2": ign3types.File{ + Node: ign3types.Node{ + Path: "/etc/random-reboot-file", + }, + FileEmbedded1: ign3types.FileEmbedded1{ + Contents: ign3types.Resource{ + Source: helpers.StrToPtr(dataurl.EncodeBytes([]byte("test 2\n"))), + }, + }, + }, + } + + tests := []struct { + oldConfig *mcfgv1.MachineConfig + newConfig *mcfgv1.MachineConfig + expectedAction []string + }{ + { + // test that a normal file change is reboot + oldConfig: helpers.NewMachineConfig("00-test", nil, "dummy://", []ign3types.File{files["randomfile1"]}), + newConfig: helpers.NewMachineConfig("01-test", nil, "dummy://", []ign3types.File{files["randomfile2"]}), + expectedAction: []string{postConfigChangeActionReboot}, + }, + { + // test that a pull secret change is none + oldConfig: helpers.NewMachineConfig("00-test", nil, "dummy://", []ign3types.File{files["pullsecret1"]}), + newConfig: helpers.NewMachineConfig("01-test", nil, "dummy://", []ign3types.File{files["pullsecret2"]}), + expectedAction: []string{postConfigChangeActionNone}, + }, + { + // test that a SSH key change is none + oldConfig: helpers.NewMachineConfigExtended("00-test", nil, []ign3types.File{}, []ign3types.Unit{}, []ign3types.SSHAuthorizedKey{"key1"}, []string{}, false, []string{}, "default", "dummy://"), + newConfig: helpers.NewMachineConfigExtended("01-test", nil, []ign3types.File{}, []ign3types.Unit{}, []ign3types.SSHAuthorizedKey{"key2"}, []string{}, false, []string{}, "default", "dummy://"), + expectedAction: []string{postConfigChangeActionNone}, + }, + { + // test that a registries change is reload + oldConfig: helpers.NewMachineConfig("00-test", nil, "dummy://", []ign3types.File{files["registries1"]}), + newConfig: helpers.NewMachineConfig("01-test", nil, "dummy://", []ign3types.File{files["registries2"]}), + expectedAction: []string{postConfigChangeActionReloadCrio}, + }, + { + // test that a registries change (reload) overwrites pull secret (none) + oldConfig: helpers.NewMachineConfig("00-test", nil, "dummy://", []ign3types.File{files["registries1"], files["pullsecret1"]}), + newConfig: helpers.NewMachineConfig("01-test", nil, "dummy://", []ign3types.File{files["registries2"], files["pullsecret2"]}), + expectedAction: []string{postConfigChangeActionReloadCrio}, + }, + { + // test that a osImage change (reboot) overwrites registries (reload) and SSH keys (none) + oldConfig: helpers.NewMachineConfigExtended("00-test", nil, []ign3types.File{files["registries1"]}, []ign3types.Unit{}, []ign3types.SSHAuthorizedKey{"key1"}, []string{}, false, []string{}, "default", "dummy://"), + newConfig: helpers.NewMachineConfigExtended("01-test", nil, []ign3types.File{files["registries2"]}, []ign3types.Unit{}, []ign3types.SSHAuthorizedKey{"key2"}, []string{}, false, []string{}, "default", "dummy1://"), + expectedAction: []string{postConfigChangeActionReboot}, + }, + { + // test that adding a pull secret is none + oldConfig: helpers.NewMachineConfigExtended("00-test", nil, []ign3types.File{files["registries1"]}, []ign3types.Unit{}, []ign3types.SSHAuthorizedKey{"key1"}, []string{}, false, []string{}, "default", "dummy://"), + newConfig: helpers.NewMachineConfigExtended("01-test", nil, []ign3types.File{files["registries1"], files["pullsecret2"]}, []ign3types.Unit{}, []ign3types.SSHAuthorizedKey{"key1"}, []string{}, false, []string{}, "default", "dummy://"), + expectedAction: []string{postConfigChangeActionNone}, + }, + { + // test that removing a registries is crio reload + oldConfig: helpers.NewMachineConfigExtended("00-test", nil, []ign3types.File{files["randomfile1"], files["registries1"]}, []ign3types.Unit{}, []ign3types.SSHAuthorizedKey{"key1"}, []string{}, false, []string{}, "default", "dummy://"), + newConfig: helpers.NewMachineConfigExtended("01-test", nil, []ign3types.File{files["randomfile1"]}, []ign3types.Unit{}, []ign3types.SSHAuthorizedKey{"key1"}, []string{}, false, []string{}, "default", "dummy://"), + expectedAction: []string{postConfigChangeActionReloadCrio}, + }, + { + // mixed test - final should be reboot due to kargs changes + oldConfig: helpers.NewMachineConfigExtended("00-test", nil, []ign3types.File{files["registries1"]}, []ign3types.Unit{}, []ign3types.SSHAuthorizedKey{"key1"}, []string{}, false, []string{}, "default", "dummy://"), + newConfig: helpers.NewMachineConfigExtended("01-test", nil, []ign3types.File{files["pullsecret2"]}, []ign3types.Unit{}, []ign3types.SSHAuthorizedKey{"key2"}, []string{}, false, []string{"karg1"}, "default", "dummy://"), + expectedAction: []string{postConfigChangeActionReboot}, + }, + } + + for idx, test := range tests { + t.Run(fmt.Sprintf("case#%d", idx), func(t *testing.T) { + calculatedAction, err := calculatePostConfigChangeAction(test.oldConfig, test.newConfig) + + if !reflect.DeepEqual(test.expectedAction, calculatedAction) { + t.Errorf("Failed calculating config change action: expected: %v but result is: %v. Error: %v", test.expectedAction, calculatedAction, err) + } + }) + } +} + // checkReconcilableResults is a shortcut for verifying results that should be reconcilable func checkReconcilableResults(t *testing.T, key string, reconcilableError error) { if reconcilableError != nil { diff --git a/test/helpers/helpers.go b/test/helpers/helpers.go index 4975a16a17..680b6bd8a6 100644 --- a/test/helpers/helpers.go +++ b/test/helpers/helpers.go @@ -35,6 +35,32 @@ func BoolToPtr(b bool) *bool { // NewMachineConfig returns a basic machine config with supplied labels, osurl & files added func NewMachineConfig(name string, labels map[string]string, osurl string, files []ign3types.File) *mcfgv1.MachineConfig { + return NewMachineConfigExtended( + name, + labels, + files, + []ign3types.Unit{}, + []ign3types.SSHAuthorizedKey{}, + []string{}, + false, + []string{}, + "", + osurl, + ) +} + +// NewMachineConfigExtended returns a more comprehensive machine config +func NewMachineConfigExtended( + name string, + labels map[string]string, + files []ign3types.File, + units []ign3types.Unit, + sshkeys []ign3types.SSHAuthorizedKey, + extensions []string, + fips bool, + kernelArguments []string, + kernelType, osurl string, +) *mcfgv1.MachineConfig { if labels == nil { labels = map[string]string{} } @@ -46,6 +72,17 @@ func NewMachineConfig(name string, labels map[string]string, osurl string, files Storage: ign3types.Storage{ Files: files, }, + Systemd: ign3types.Systemd{ + Units: units, + }, + Passwd: ign3types.Passwd{ + Users: []ign3types.PasswdUser{ + { + Name: "core", + SSHAuthorizedKeys: sshkeys, + }, + }, + }, }, ) @@ -59,10 +96,14 @@ func NewMachineConfig(name string, labels map[string]string, osurl string, files UID: types.UID(utilrand.String(5)), }, Spec: mcfgv1.MachineConfigSpec{ - OSImageURL: osurl, Config: runtime.RawExtension{ Raw: rawIgnition, }, + Extensions: extensions, + FIPS: fips, + KernelArguments: kernelArguments, + KernelType: kernelType, + OSImageURL: osurl, }, } } From 85ac6c0766f7d46361f687d94bd060485c592e59 Mon Sep 17 00:00:00 2001 From: Sinny Kumari Date: Tue, 1 Dec 2020 12:57:38 +0100 Subject: [PATCH 5/8] daemon: use systemctl reload for crio config update --- pkg/daemon/update.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go index 51654037d6..7fb8f84e4a 100644 --- a/pkg/daemon/update.go +++ b/pkg/daemon/update.go @@ -99,8 +99,8 @@ func getNodeRef(node *corev1.Node) *corev1.ObjectReference { } } -func reloadCrioConfig() error { - _, err := runGetOut("pkill", "-HUP", "crio") +func reloadService(name string) error { + _, err := runGetOut("systemctl", "reload", name) return err } @@ -117,11 +117,12 @@ func (dn *Daemon) performPostConfigChangeAction(postConfigChangeActions []string dn.logSystem("Node has Desired Config %s, skipping reboot", configName) } if ctrlcommon.InSlice(postConfigChangeActionReloadCrio, postConfigChangeActions) { - if err := reloadCrioConfig(); err != nil { - dn.logSystem("Reloading crio configuration failed, rebooting: %v", err) + serviceName := "crio" + if err := reloadService(serviceName); err != nil { + dn.logSystem("Reloading %s configuration failed, rebooting: %v", serviceName, err) dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName)) } - dn.logSystem("crio config reloaded successfully! Desired config %s has been applied, skipping reboot", configName) + dn.logSystem("%s config reloaded successfully! Desired config %s has been applied, skipping reboot", serviceName, configName) } // We are here, which means reboot was not needed to apply the configuration. From ffacb2d948be9fc091cf94f21ad1d274e312df60 Mon Sep 17 00:00:00 2001 From: Sinny Kumari Date: Mon, 30 Nov 2020 11:53:37 +0100 Subject: [PATCH 6/8] test/e2e: add e2e test for no reboot e2e test to apply a MachineConfig to add a ssh key as authorized key for user core. It ensures that the ssh key is added without rebooting node. Also, applied MachineConfig gets deleted without node reboot --- test/e2e/mcd_test.go | 124 +++++++++++++++++++++++++++++++++++++++++ test/e2e/utils_test.go | 16 ++++-- 2 files changed, 134 insertions(+), 6 deletions(-) diff --git a/test/e2e/mcd_test.go b/test/e2e/mcd_test.go index e2072e5eeb..ee4b83ffb6 100644 --- a/test/e2e/mcd_test.go +++ b/test/e2e/mcd_test.go @@ -3,6 +3,7 @@ package e2e_test import ( "context" "fmt" + "strconv" "strings" "testing" "time" @@ -325,6 +326,129 @@ func TestExtensions(t *testing.T) { require.Nil(t, err) } +func TestNoReboot(t *testing.T) { + cs := framework.NewClientSet("") + unlabelFunc := labelRandomNodeFromPool(t, cs, "worker", "node-role.kubernetes.io/infra") + oldInfraRenderedConfig := getMcName(t, cs, "infra") + + infraNode := getSingleNodeByRole(t, cs, "infra") + + output := execCmdOnNode(t, cs, infraNode, "cat", "/rootfs/proc/uptime") + t.Logf("Debug: Initial uptime file content %s", output) + oldTime := strings.Split(output, " ")[0] + t.Logf("Node %s initial uptime: %s", infraNode.Name, oldTime) + + // Adding authorized key for user core + testIgnConfig := ctrlcommon.NewIgnConfig() + testSSHKey := ign3types.PasswdUser{Name: "core", SSHAuthorizedKeys: []ign3types.SSHAuthorizedKey{"test adding authorized key without node reboot"}} + testIgnConfig.Passwd.Users = append(testIgnConfig.Passwd.Users, testSSHKey) + + addAuthorizedKey := &mcfgv1.MachineConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("authorzied-key-infra-%s", uuid.NewUUID()), + Labels: mcLabelForRole("infra"), + }, + Spec: mcfgv1.MachineConfigSpec{ + Config: runtime.RawExtension{ + Raw: helpers.MarshalOrDie(testIgnConfig), + }, + }, + } + + _, err := cs.MachineConfigs().Create(context.TODO(), addAuthorizedKey, metav1.CreateOptions{}) + require.Nil(t, err, "failed to create MC") + t.Logf("Created %s", addAuthorizedKey.Name) + + // grab the latest worker- MC + renderedConfig, err := waitForRenderedConfig(t, cs, "infra", addAuthorizedKey.Name) + require.Nil(t, err) + err = waitForPoolComplete(t, cs, "infra", renderedConfig) + require.Nil(t, err) + + // Re-fetch the infra node for updated annotations + infraNode = getSingleNodeByRole(t, cs, "infra") + + assert.Equal(t, infraNode.Annotations[constants.CurrentMachineConfigAnnotationKey], renderedConfig) + assert.Equal(t, infraNode.Annotations[constants.MachineConfigDaemonStateAnnotationKey], constants.MachineConfigDaemonStateDone) + + foundSSHKey := execCmdOnNode(t, cs, infraNode, "cat", "/rootfs/home/core/.ssh/authorized_keys") + if !strings.Contains(foundSSHKey, "test adding authorized key without node reboot") { + t.Fatalf("updated ssh keys not found in authorized_keys, got %s", foundSSHKey) + } + t.Logf("Node %s has SSH key", infraNode.Name) + + output = execCmdOnNode(t, cs, infraNode, "cat", "/rootfs/proc/uptime") + newTime := strings.Split(output, " ")[0] + + // To ensure we didn't reboot, new uptime should be greater than old uptime + uptimeOld, err := strconv.ParseFloat(oldTime, 64) + require.Nil(t, err) + + uptimeNew, err := strconv.ParseFloat(newTime, 64) + require.Nil(t, err) + + if uptimeOld > uptimeNew { + t.Fatalf("Node %s rebooted uptime decreased from %f to %f", infraNode.Name, uptimeOld, uptimeNew) + } + + t.Logf("Node %s didn't reboot as expected, uptime increased from %f to %f ", infraNode.Name, uptimeOld, uptimeNew) + + // Delete the applied authorized key MachineConfig to make sure rollback works fine without node reboot + if err := cs.MachineConfigs().Delete(context.TODO(), addAuthorizedKey.Name, metav1.DeleteOptions{}); err != nil { + t.Error(err) + } + + t.Logf("Deleted MachineConfig %s", addAuthorizedKey.Name) + + // Wait for the mcp to rollback to previous config + if err := waitForPoolComplete(t, cs, "infra", oldInfraRenderedConfig); err != nil { + t.Fatal(err) + } + + // Re-fetch the infra node for updated annotations + infraNode = getSingleNodeByRole(t, cs, "infra") + + assert.Equal(t, infraNode.Annotations[constants.CurrentMachineConfigAnnotationKey], oldInfraRenderedConfig) + assert.Equal(t, infraNode.Annotations[constants.MachineConfigDaemonStateAnnotationKey], constants.MachineConfigDaemonStateDone) + + foundSSHKey = execCmdOnNode(t, cs, infraNode, "cat", "/rootfs/home/core/.ssh/authorized_keys") + if strings.Contains(foundSSHKey, "test adding authorized key without node reboot") { + t.Fatalf("Node %s did not rollback successfully", infraNode.Name) + } + + t.Logf("Node %s has successfully rolled back", infraNode.Name) + + // Ensure that node didn't reboot during rollback + output = execCmdOnNode(t, cs, infraNode, "cat", "/rootfs/proc/uptime") + newTime = strings.Split(output, " ")[0] + + uptimeNew, err = strconv.ParseFloat(newTime, 64) + require.Nil(t, err) + + if uptimeOld > uptimeNew { + t.Fatalf("Node %s rebooted during rollback, uptime decreased from %f to %f", infraNode.Name, uptimeOld, uptimeNew) + } + + t.Logf("Node %s didn't reboot as expected during rollback, uptime increased from %f to %f ", infraNode.Name, uptimeOld, uptimeNew) + + unlabelFunc() + + workerMCP, err := cs.MachineConfigPools().Get(context.TODO(), "worker", metav1.GetOptions{}) + require.Nil(t, err) + if err := wait.Poll(2*time.Second, 5*time.Minute, func() (bool, error) { + node, err := cs.Nodes().Get(context.TODO(), infraNode.Name, metav1.GetOptions{}) + require.Nil(t, err) + if node.Annotations[constants.DesiredMachineConfigAnnotationKey] != workerMCP.Spec.Configuration.Name { + return false, nil + } + return true, nil + }); err != nil { + t.Errorf("infra node hasn't moved back to worker config: %v", err) + } + err = waitForPoolComplete(t, cs, "infra", oldInfraRenderedConfig) + require.Nil(t, err) +} + func TestPoolDegradedOnFailToRender(t *testing.T) { cs := framework.NewClientSet("") diff --git a/test/e2e/utils_test.go b/test/e2e/utils_test.go index 401404774b..ec9eaf6dbb 100644 --- a/test/e2e/utils_test.go +++ b/test/e2e/utils_test.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "math/rand" + "os" "os/exec" "testing" "time" @@ -166,21 +167,24 @@ func createMC(name, role string) *mcfgv1.MachineConfig { // execCmdOnNode finds a node's mcd, and oc rsh's into it to execute a command on the node // all commands should use /rootfs as root -func execCmdOnNode(t *testing.T, cs *framework.ClientSet, node corev1.Node, args ...string) string { +func execCmdOnNode(t *testing.T, cs *framework.ClientSet, node corev1.Node, subArgs ...string) string { mcd, err := mcdForNode(cs, &node) require.Nil(t, err) mcdName := mcd.ObjectMeta.Name entryPoint := "oc" - cmd := []string{"rsh", + args := []string{"rsh", "-n", "openshift-machine-config-operator", "-c", "machine-config-daemon", mcdName} - cmd = append(cmd, args...) + args = append(args, subArgs...) - b, err := exec.Command(entryPoint, cmd...).CombinedOutput() - require.Nil(t, err, "failed to exec cmd %v on node %s: %s", args, node.Name, string(b)) - return string(b) + cmd := exec.Command(entryPoint, args...) + cmd.Stderr = os.Stderr + + out, err := cmd.Output() + require.Nil(t, err, "failed to exec cmd %v on node %s: %s", subArgs, node.Name, string(out)) + return string(out) } func mcLabelForRole(role string) map[string]string { From 0103ee19b464de9c567bdc64dbac5acded09b3de Mon Sep 17 00:00:00 2001 From: Yu Qi Zhang Date: Tue, 1 Dec 2020 17:19:52 -0500 Subject: [PATCH 7/8] reboot: add comments and clean up code Add some better comments for reboot work, add some events when not rebooting, and return better log messages. --- pkg/daemon/daemon.go | 4 ++-- pkg/daemon/update.go | 25 +++++++++++++++++-------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 912864f79d..9fd3019dc3 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -1176,9 +1176,9 @@ func (dn *Daemon) updateConfigAndState(state *stateAndConfigs) (bool, error) { glog.Infof("In desired config %s", state.currentConfig.GetName()) MCDUpdateState.WithLabelValues(state.currentConfig.GetName(), "").SetToCurrentTime() - - // All good! } + + // No errors have occurred. Returns true if currentConfig == desiredConfig, false otherwise (needs update) return inDesiredConfig, nil } diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go index 7fb8f84e4a..b64cd7093b 100644 --- a/pkg/daemon/update.go +++ b/pkg/daemon/update.go @@ -49,10 +49,14 @@ const ( extensionsRepo = "/etc/yum.repos.d/coreos-extensions.repo" osImageContentBaseDir = "/run/mco-machine-os-content/" - // These are the actions for a node to take after applying config changes. - // Defaults to reboot. - postConfigChangeActionNone = "none" - postConfigChangeActionReboot = "reboot" + // These are the actions for a node to take after applying config changes. (e.g. a new machineconfig is applied) + // "None" means no special action needs to be taken. A drain will still happen. + // This currently happens when ssh keys or pull secret (/var/lib/kubelet/config.json) is changed + postConfigChangeActionNone = "none" + // Rebooting is still the default scenario for any other change + postConfigChangeActionReboot = "reboot" + // Crio reload will happen when /etc/containers/registries.conf is changed. This will cause + // a "systemctl reload crio" postConfigChangeActionReloadCrio = "reload crio" ) @@ -114,14 +118,20 @@ func (dn *Daemon) performPostConfigChangeAction(postConfigChangeActions []string return dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName)) } if ctrlcommon.InSlice(postConfigChangeActionNone, postConfigChangeActions) { + if dn.recorder != nil { + dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "Reboot", "Config changes do not require reboot.") + } dn.logSystem("Node has Desired Config %s, skipping reboot", configName) } if ctrlcommon.InSlice(postConfigChangeActionReloadCrio, postConfigChangeActions) { serviceName := "crio" if err := reloadService(serviceName); err != nil { - dn.logSystem("Reloading %s configuration failed, rebooting: %v", serviceName, err) + dn.logSystem("Reloading %s configuration failed, node will reboot instead. Error: %v", serviceName, err) dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName)) } + if dn.recorder != nil { + dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "Reboot", "Config changes do not require reboot. Service %s was reloaded.", serviceName) + } dn.logSystem("%s config reloaded successfully! Desired config %s has been applied, skipping reboot", serviceName, configName) } @@ -130,13 +140,13 @@ func (dn *Daemon) performPostConfigChangeAction(postConfigChangeActions []string // Get current state of node, in case of an error reboot state, err := dn.getStateAndConfigs(configName) if err != nil { - glog.Errorf("Error processing state and configs, rebooting: %v", err) + glog.Errorf("Error processing state and configs, node will reboot instead. Error: %v", err) return dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName)) } var inDesiredConfig bool if inDesiredConfig, err = dn.updateConfigAndState(state); err != nil { - glog.Errorf("Setting node's state to Done failed, rebooting: %v", err) + glog.Errorf("Setting node's state to Done failed, node will reboot instead. Error: %v", err) return dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName)) } if inDesiredConfig { @@ -578,7 +588,6 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig) (retErr err dn.logSystem("Starting update from %s to %s: %+v", oldConfigName, newConfigName, diff) // TODO: consider how we should honor "force" flag here. Maybe if force, always reboot? - // TODO: consider if we should not cordon if no action needs to be taken actions, err := calculatePostConfigChangeAction(oldConfig, newConfig) if err != nil { return err From e25a6137d0b81d02ba1ea32e5162d8831c42ed7c Mon Sep 17 00:00:00 2001 From: Sinny Kumari Date: Wed, 2 Dec 2020 14:05:47 +0100 Subject: [PATCH 8/8] daemon: always set postConfigChangeActionReboot when force file is present Also, added event for crio service failure and updated event reasoning from Reboot to SkipReboot --- pkg/daemon/daemon.go | 3 --- pkg/daemon/update.go | 23 ++++++++++++++++++++--- test/e2e/mcd_test.go | 1 - 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 9fd3019dc3..4e244efb1a 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -1103,9 +1103,6 @@ func (dn *Daemon) checkStateOnFirstRun() error { glog.Info("Validated on-disk state") } else { glog.Infof("Skipping on-disk validation; %s present", constants.MachineConfigDaemonForceFile) - if err := os.Remove(constants.MachineConfigDaemonForceFile); err != nil { - return errors.Wrap(err, "failed to remove force validation file") - } return dn.triggerUpdateWithMachineConfig(state.currentConfig, state.desiredConfig) } diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go index b64cd7093b..85cd114205 100644 --- a/pkg/daemon/update.go +++ b/pkg/daemon/update.go @@ -117,20 +117,27 @@ func (dn *Daemon) performPostConfigChangeAction(postConfigChangeActions []string dn.logSystem("Rebooting node") return dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName)) } + if ctrlcommon.InSlice(postConfigChangeActionNone, postConfigChangeActions) { if dn.recorder != nil { - dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "Reboot", "Config changes do not require reboot.") + dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "SkipReboot", "Config changes do not require reboot.") } dn.logSystem("Node has Desired Config %s, skipping reboot", configName) } + if ctrlcommon.InSlice(postConfigChangeActionReloadCrio, postConfigChangeActions) { serviceName := "crio" + if err := reloadService(serviceName); err != nil { + if dn.recorder != nil { + dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeWarning, "FailedServiceReload", fmt.Sprintf("Reloading %s service failed. Error: %v", serviceName, err)) + } dn.logSystem("Reloading %s configuration failed, node will reboot instead. Error: %v", serviceName, err) dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName)) } + if dn.recorder != nil { - dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "Reboot", "Config changes do not require reboot. Service %s was reloaded.", serviceName) + dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "SkipReboot", "Config changes do not require reboot. Service %s was reloaded.", serviceName) } dn.logSystem("%s config reloaded successfully! Desired config %s has been applied, skipping reboot", serviceName, configName) } @@ -519,6 +526,17 @@ func calculatePostConfigChangeActionFromFileDiffs(oldIgnConfig, newIgnConfig ign } func calculatePostConfigChangeAction(oldConfig, newConfig *mcfgv1.MachineConfig) ([]string, error) { + // If a machine-config-daemon-force file is present, it means the user wants to + // move to desired state without additional validation. We will reboot the node in + // this case regardless of what MachineConfig diff is. + if _, err := os.Stat(constants.MachineConfigDaemonForceFile); err == nil { + if err := os.Remove(constants.MachineConfigDaemonForceFile); err != nil { + return []string{}, errors.Wrap(err, "failed to remove force validation file") + } + glog.Infof("Setting post config change action to postConfigChangeActionReboot; %s present", constants.MachineConfigDaemonForceFile) + return []string{postConfigChangeActionReboot}, nil + } + diff, err := newMachineConfigDiff(oldConfig, newConfig) if err != nil { return []string{}, err @@ -587,7 +605,6 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig) (retErr err dn.logSystem("Starting update from %s to %s: %+v", oldConfigName, newConfigName, diff) - // TODO: consider how we should honor "force" flag here. Maybe if force, always reboot? actions, err := calculatePostConfigChangeAction(oldConfig, newConfig) if err != nil { return err diff --git a/test/e2e/mcd_test.go b/test/e2e/mcd_test.go index ee4b83ffb6..0a0c69dd6c 100644 --- a/test/e2e/mcd_test.go +++ b/test/e2e/mcd_test.go @@ -334,7 +334,6 @@ func TestNoReboot(t *testing.T) { infraNode := getSingleNodeByRole(t, cs, "infra") output := execCmdOnNode(t, cs, infraNode, "cat", "/rootfs/proc/uptime") - t.Logf("Debug: Initial uptime file content %s", output) oldTime := strings.Split(output, " ")[0] t.Logf("Node %s initial uptime: %s", infraNode.Name, oldTime)