Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 17 additions & 26 deletions pkg/daemon/update.go
Original file line number Diff line number Diff line change
Expand Up @@ -726,38 +726,35 @@ func calculatePostConfigChangeAction(diff *machineConfigDiff, diffFileSet []stri
func (dn *Daemon) calculatePostConfigChangeNodeDisruptionAction(diff *machineConfigDiff, diffFileSet, diffUnitSet []string) ([]opv1.NodeDisruptionPolicyStatusAction, error) {

var mcop *opv1.MachineConfiguration
var err error
var pollErr error
// Wait for mcop.Status.NodeDisruptionPolicyStatus to populate, otherwise error out. This shouldn't take very long
// as this is done by the operator sync loop, but may be extended if transitioning to TechPreview as the operator restarts,
if err = wait.PollUntilContextTimeout(context.TODO(), 5*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) {
mcop, err = dn.mcopClient.OperatorV1().MachineConfigurations().Get(context.TODO(), ctrlcommon.MCOOperatorKnobsObjectName, metav1.GetOptions{})
if err != nil {
if err := wait.PollUntilContextTimeout(context.TODO(), 5*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) {
mcop, pollErr = dn.mcopClient.OperatorV1().MachineConfigurations().Get(context.TODO(), ctrlcommon.MCOOperatorKnobsObjectName, metav1.GetOptions{})
if pollErr != nil {
klog.Errorf("calculating NodeDisruptionPolicies: MachineConfiguration/cluster has not been created yet")
err = fmt.Errorf("MachineConfiguration/cluster has not been created yet")
pollErr = fmt.Errorf("MachineConfiguration/cluster has not been created yet")
return false, nil
}
// There will always be atleast five file policies, if they don't exist, then the Status hasn't been populated yet
//
// TODO: When Conditions on this object are implemented; this check could be updated to only proceed when
// status.ObservedGeneration matches the last generation of MachineConfiguration
if len(mcop.Status.NodeDisruptionPolicyStatus.ClusterPolicies.Files) == 0 {
klog.Errorf("calculating NodeDisruptionPolicies: NodeDisruptionPolicyStatus has not been populated yet")
err = fmt.Errorf("NodeDisruptionPolicyStatus has not been populated yet")

// Ensure status.ObservedGeneration matches the last generation of MachineConfiguration
if mcop.Generation != mcop.Status.ObservedGeneration {
klog.Errorf("calculating NodeDisruptionPolicies: NodeDisruptionPolicyStatus is not up to date.")
pollErr = fmt.Errorf("NodeDisruptionPolicyStatus is not up to date")
return false, nil
}
return true, nil
}); err != nil {
klog.Errorf("NodeDisruptionPolicyStatus was not ready: %v", err)
err = fmt.Errorf("NodeDisruptionPolicyStatus was not ready: %v", err)
return nil, err
klog.Errorf("NodeDisruptionPolicyStatus was not ready: %v", pollErr)
return nil, fmt.Errorf("NodeDisruptionPolicyStatus was not ready: %v", pollErr)
}

// Continue policy calculation if no errors were encountered in fetching the policy.
// If a machine-config-daemon-force file is present, it means the user wants to
// move to desired state without additional validation. We will reboot the node in
// this case regardless of what MachineConfig diff is.
klog.Infof("Calculating node disruption actions")
if _, err = os.Stat(constants.MachineConfigDaemonForceFile); err == nil {
if _, err := os.Stat(constants.MachineConfigDaemonForceFile); err == nil {
if err = os.Remove(constants.MachineConfigDaemonForceFile); err != nil {
return []opv1.NodeDisruptionPolicyStatusAction{}, fmt.Errorf("failed to remove force validation file: %w", err)
}
Expand Down Expand Up @@ -1052,16 +1049,10 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig, skipCertifi
}

var nodeDisruptionActions []opv1.NodeDisruptionPolicyStatusAction
var nodeDisruptionError error
var actions []string
// If FeatureGateNodeDisruptionPolicy is set, calculate NodeDisruptionPolicy based actions for this MC diff
if fg != nil && fg.Enabled(features.FeatureGateNodeDisruptionPolicy) {
nodeDisruptionActions, nodeDisruptionError = dn.calculatePostConfigChangeNodeDisruptionAction(diff, diffFileSet, diffUnitSet)
if nodeDisruptionError != nil {
// TODO: Fallback to legacy path and signal failure here
klog.Errorf("could not calculate node disruption actions: %v", nodeDisruptionError)
actions, err = calculatePostConfigChangeAction(diff, diffFileSet)
}
nodeDisruptionActions, err = dn.calculatePostConfigChangeNodeDisruptionAction(diff, diffFileSet, diffUnitSet)
} else {
actions, err = calculatePostConfigChangeAction(diff, diffFileSet)
}
Expand All @@ -1083,13 +1074,13 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig, skipCertifi
}

var drain bool
if fg != nil && fg.Enabled(features.FeatureGateNodeDisruptionPolicy) && nodeDisruptionError == nil {
if fg != nil && fg.Enabled(features.FeatureGateNodeDisruptionPolicy) {
// Check actions list and perform node drain if required
drain, err = isDrainRequiredForNodeDisruptionActions(nodeDisruptionActions, oldIgnConfig, newIgnConfig)
if err != nil {
return err
}
klog.Infof("Drain calculated for node disruption: %v", drain)
klog.Infof("Drain calculated for node disruption: %v for config %s", drain, newConfigName)
} else {
// Check and perform node drain if required
crioOverrideConfigmapExists, err := dn.hasImageRegistryDrainOverrideConfigMap()
Expand Down Expand Up @@ -1285,7 +1276,7 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig, skipCertifi
klog.Errorf("Error making MCN for Updated Files and OS: %v", err)
}

if fg != nil && fg.Enabled(features.FeatureGateNodeDisruptionPolicy) && nodeDisruptionError == nil {
if fg != nil && fg.Enabled(features.FeatureGateNodeDisruptionPolicy) {
return dn.performPostConfigChangeNodeDisruptionAction(nodeDisruptionActions, newConfig.GetName())
}
// If we're here, FeatureGateNodeDisruptionPolicy is off/errored, so perform legacy action
Expand Down
15 changes: 9 additions & 6 deletions pkg/operator/sync.go
Original file line number Diff line number Diff line change
Expand Up @@ -2038,17 +2038,20 @@ func (optr *Operator) syncMachineConfiguration(_ *renderConfig) error {
}

// Merges the cluster's default node disruption policies with the user defined policies, if any.
newNodeDisruptionPolicyStatus := opv1.NodeDisruptionPolicyStatus{
ClusterPolicies: apihelpers.MergeClusterPolicies(mcop.Spec.NodeDisruptionPolicy),
newMachineConfigurationStatus := opv1.MachineConfigurationStatus{
NodeDisruptionPolicyStatus: opv1.NodeDisruptionPolicyStatus{
ClusterPolicies: apihelpers.MergeClusterPolicies(mcop.Spec.NodeDisruptionPolicy),
},
ObservedGeneration: mcop.GetGeneration(),
}

// Check if any changes are required in the Status before making the API call.
if !reflect.DeepEqual(mcop.Status.NodeDisruptionPolicyStatus, newNodeDisruptionPolicyStatus) {
klog.Infof("Updating NodeDisruptionPolicy status")
mcop.Status.NodeDisruptionPolicyStatus = newNodeDisruptionPolicyStatus
if !reflect.DeepEqual(mcop.Status, newMachineConfigurationStatus) {
klog.Infof("Updating MachineConfiguration status")
mcop.Status = newMachineConfigurationStatus
_, err = optr.mcopClient.OperatorV1().MachineConfigurations().UpdateStatus(context.TODO(), mcop, metav1.UpdateOptions{})
if err != nil {
klog.Errorf("NodeDisruptionPolicy status apply failed: %v", err)
klog.Errorf("MachineConfiguration status apply failed: %v", err)
return nil
}
}
Expand Down
21 changes: 0 additions & 21 deletions test/e2e-shared-tests/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,27 +102,6 @@ func waitForConfigDriftMonitorStart(t *testing.T, cs *framework.ClientSet, node
require.Nil(t, err, "expected config drift monitor to start (waited %v)", end)
}

func assertLogsContain(t *testing.T, cs *framework.ClientSet, mcdPod *corev1.Pod, node *corev1.Node, expectedContents string) {
t.Helper()

logs, err := cs.Pods(mcdPod.Namespace).GetLogs(mcdPod.Name, &corev1.PodLogOptions{
Container: "machine-config-daemon",
}).DoRaw(context.TODO())
if err != nil {
// common err is that the mcd went down mid cmd. Re-try for good measure
mcdPod, err = helpers.MCDForNode(cs, node)
require.Nil(t, err)
logs, err = cs.Pods(mcdPod.Namespace).GetLogs(mcdPod.Name, &corev1.PodLogOptions{
Container: "machine-config-daemon",
}).DoRaw(context.TODO())
}
require.Nil(t, err)

if !strings.Contains(string(logs), expectedContents) {
t.Fatalf("expected to find '%s' in logs for %s/%s", expectedContents, mcdPod.Namespace, mcdPod.Name)
}
}

func assertNodeIsInDoneState(t *testing.T, cs *framework.ClientSet, node corev1.Node) {
t.Helper()

Expand Down
2 changes: 1 addition & 1 deletion test/e2e-shared-tests/mcd_config_drift.go
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,7 @@ func assertNodeAndMCPIsDegraded(t *testing.T, cs *framework.ClientSet, node core
mcdPod, err := helpers.MCDForNode(cs, &node)
require.Nil(t, err)

assertLogsContain(t, cs, mcdPod, &node, logEntry)
helpers.AssertMCDLogsContain(t, cs, mcdPod, &node, logEntry)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

praise: Nice!


// Assert that the MachineConfigPool eventually reaches a degraded state and has the config mismatch as the reason.
t.Log("Verifying MachineConfigPool becomes degraded due to config mismatch")
Expand Down
3 changes: 1 addition & 2 deletions test/e2e-techpreview/helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import (
"github.com/stretchr/testify/require"
"golang.org/x/sync/errgroup"
corev1 "k8s.io/api/core/v1"
apierrs "k8s.io/apimachinery/pkg/api/errors"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
aggerrs "k8s.io/apimachinery/pkg/util/errors"
Expand Down Expand Up @@ -564,7 +563,7 @@ func copyEntitlementCerts(t *testing.T, cs *framework.ClientSet) func() {
return cloneSecret(t, cs, name, namespace, name, ctrlcommon.MCONamespace)
}

if apierrs.IsNotFound(err) {
if k8serrors.IsNotFound(err) {
t.Logf("Secret %q not found in %q, skipping test", name, namespace)
t.Skip()
return func() {}
Expand Down
2 changes: 1 addition & 1 deletion test/e2e-techpreview/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ func TestMain(m *testing.M) {

// Ensure required feature gates are set.
// Add any new feature gates to the test here, and remove them as features are GAed.
helpers.MustHaveFeatureGatesEnabled("ManagedBootImages", "OnClusterBuild", "MachineConfigNodes")
helpers.MustHaveFeatureGatesEnabled("ManagedBootImages", "OnClusterBuild", "MachineConfigNodes", "NodeDisruptionPolicy")

os.Exit(m.Run())
}
Loading