diff --git a/go.mod b/go.mod index 55ac86e6e1..efd87148e6 100644 --- a/go.mod +++ b/go.mod @@ -19,6 +19,7 @@ require ( github.com/coreos/ignition/v2 v2.7.0 github.com/davecgh/go-spew v1.1.1 github.com/elazarl/goproxy v0.0.0-20190911111923-ecfe977594f1 // indirect + github.com/fsnotify/fsnotify v1.4.9 github.com/ghodss/yaml v1.0.0 github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b github.com/golangci/golangci-lint v1.42.1 diff --git a/hack/build-go.sh b/hack/build-go.sh index ad69a6b6fd..ca1b16cdc7 100755 --- a/hack/build-go.sh +++ b/hack/build-go.sh @@ -43,5 +43,5 @@ if [[ $WHAT == "machine-config-controller" ]]; then GOTAGS="containers_image_openpgp exclude_graphdriver_devicemapper exclude_graphdriver_btrfs containers_image_ostree_stub" fi -echo "Building ${REPO}/cmd/${WHAT} (${VERSION_OVERRIDE}, ${HASH})" +echo "Building ${REPO}/cmd/${WHAT} (${VERSION_OVERRIDE}, ${HASH}) for $GOOS/$GOARCH" CGO_ENABLED=${CGO_ENABLED} GOOS=${GOOS} GOARCH=${GOARCH} go build -mod=vendor -tags="${GOTAGS}" -ldflags "${GLDFLAGS} -s -w" -o ${BIN_PATH}/${WHAT} ${REPO}/cmd/${WHAT} diff --git a/pkg/daemon/config_drift_monitor.go b/pkg/daemon/config_drift_monitor.go new file mode 100644 index 0000000000..915621d239 --- /dev/null +++ b/pkg/daemon/config_drift_monitor.go @@ -0,0 +1,370 @@ +package daemon + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "sync" + + ign2types "github.com/coreos/ignition/config/v2_2/types" + ign3types "github.com/coreos/ignition/v2/config/v3_2/types" + "github.com/fsnotify/fsnotify" + "github.com/golang/glog" + mcfgv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1" + ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" + "k8s.io/apimachinery/pkg/util/sets" +) + +// Outermost error type for config drift errors +type configDriftErr error + +// Error type for file config drifts +type fileConfigDriftErr error + +// Error type for systemd unit config drifts +type unitConfigDriftErr error + +type ConfigDriftMonitor interface { + Start(ConfigDriftMonitorOpts) error + Done() <-chan struct{} + IsRunning() bool + Stop() +} + +type ConfigDriftMonitorOpts struct { + // Called whenever a config drift is detected. + OnDrift func(error) + // The currently applied MachineConfig. + MachineConfig *mcfgv1.MachineConfig + // The Systemd dropin path location. + // Defaults to /etc/systemd/system + SystemdPath string + // Channel to report unknown errors + ErrChan chan<- error +} + +// Holds the Config Drift Watcher and ensures we only have a single instance +// running at a given time. +type configDriftMonitor struct { + cdw *configDriftWatcher + mu sync.Mutex + stopCh chan struct{} +} + +// Implements the Config Drift Watcher +type configDriftWatcher struct { + ConfigDriftMonitorOpts + watcher *fsnotify.Watcher + filePaths sets.String + wg sync.WaitGroup + stopCh chan struct{} +} + +// Holds a single Config Drift Watcher and starts / stops it as necessary while +// ensuring that only a single Config Drift Monitor is running at any given +// time (within its scope). +func NewConfigDriftMonitor() ConfigDriftMonitor { + return &configDriftMonitor{ + mu: sync.Mutex{}, + stopCh: make(chan struct{}), + } +} + +// Determines if a Config Drift Watcher is running; useful for avoiding certain +// logic such as emitting a startup event. +func (c *configDriftMonitor) IsRunning() bool { + c.mu.Lock() + defer c.mu.Unlock() + return c.cdw != nil +} + +// Starts the Config Drift Monitor if not already started. It will create a new +// Config Drift Monitor instance, if necessary. +func (c *configDriftMonitor) Start(opts ConfigDriftMonitorOpts) error { + c.mu.Lock() + defer c.mu.Unlock() + + if c.cdw != nil { + return nil + } + + var err error + + // We don't have a Config Drift Watcher running, create one. + c.cdw, err = newConfigDriftWatcher(opts) + if err != nil { + return err + } + + c.cdw.start() + + return nil +} + +// Provides a channel for listening if the currently running Config Drift +// Monitor has been stopped. Can be used similar to <-ctx.Done(). +func (c *configDriftMonitor) Done() <-chan struct{} { + out := make(chan struct{}) + + go func() { + select { + case <-c.stopCh: + out <- struct{}{} + close(out) + return + } + }() + + return out +} + +// Stops the currently running Config Drift Monitor and deletes its instance so +// that it cannot be reused. Also sends a signal to the Done() listener. +func (c *configDriftMonitor) Stop() { + c.mu.Lock() + defer c.mu.Unlock() + + if c.cdw != nil { + c.cdw.stop() + c.cdw = nil + c.stopCh <- struct{}{} + } +} + +// Creates a new config drift watcher for a given machineconfig and registers +// a callback for when a drift occurs. +func newConfigDriftWatcher(opts ConfigDriftMonitorOpts) (*configDriftWatcher, error) { + if opts.OnDrift == nil { + return nil, fmt.Errorf("no ondrift function attached") + } + + if opts.ErrChan == nil { + return nil, fmt.Errorf("no error channel provided") + } + + if opts.MachineConfig == nil { + return nil, fmt.Errorf("no machine config provided") + } + + if opts.SystemdPath == "" { + opts.SystemdPath = pathSystemd + } + + c := &configDriftWatcher{ + ConfigDriftMonitorOpts: opts, + stopCh: make(chan struct{}), + } + + if err := c.initialize(); err != nil { + return nil, fmt.Errorf("could not initialize config drift monitor: %w", err) + } + + return c, nil +} + +// Performs the initial setup and initialization of the Config Drift Monitor, +// such as identifying files from the MachineConfig, wiring up the watchers, +// etc. +func (c *configDriftWatcher) initialize() error { + var err error + + // Initialize the fsnotify watcher + c.watcher, err = fsnotify.NewWatcher() + if err != nil { + return fmt.Errorf("could not start watcher: %w", err) + } + + glog.V(4).Infof("Initializing Config Drift Monitor") + glog.V(4).Infof("Current MachineConfig: %s", c.MachineConfig.Name) + + // Even though we wire up fsnotify to use the parent directories of each + // file, we keep track of the file paths individually so that we can ignore + // files in the same directory which are not referenced by the MachineConfig. + // This is useful when, for example, someone places a file in /etc using a + // MachineConfig (e.g., /etc/foo), but an unknown (to the MachineConfig) file + // in /etc (e.g., /etc/bar) is written to. + c.filePaths, err = getFilePathsFromMachineConfig(c.MachineConfig, c.SystemdPath) + if err != nil { + return fmt.Errorf("could not get file paths from machine config: %w", err) + } + + // fsnotify (presently) uses inotify instead of fanotify on Linux. + // See: https://github.com/fsnotify/fsnotify/issues/114 + // + // We must be judicious about the number of files we have wired up. However, + // inotify cannot recurse into directories. To work around these limitaions, + // we get the dirnames for all of the files in the Ignition config and dedupe + // them and attach watchers to those dirs. + dirPaths := getDirPathsFromFilePaths(c.filePaths) + + glog.V(4).Infof("Will watch %d directories containing %d files:", len(dirPaths), len(c.filePaths)) + + // Wire up fsnotify to watch our config dirs + for _, path := range dirPaths { + glog.V(4).Infof("Watching dir: \"%s\"", path) + if err := c.watcher.Add(path); err != nil { + return fmt.Errorf("could not add fsnotify watcher to dir \"%s\": %w", path, err) + } + } + + return nil +} + +// Starts the Config Drift Watcher +func (c *configDriftWatcher) start() { + c.wg = sync.WaitGroup{} + c.wg.Add(1) + + go func() { + defer c.wg.Done() + for { + select { + case event := <-c.watcher.Events: + // Our watcher is reporting an event that we should look at. + if err := c.handleFileEvent(event); err != nil { + // Send unknown file event errors to the error channel. + c.ErrChan <- err + } + case err := <-c.watcher.Errors: + // Send fsnotify errors directly to the error channel. + c.ErrChan <- fmt.Errorf("fsnotify error: %w", err) + case <-c.stopCh: + // We received a stop signal, shutdown our watcher. + c.watcher.Close() + return + } + } + }() + + glog.Info("Config Drift Monitor started") +} + +// Stops the watcher. +// Note: Once a Config Drift Watcher has been stopped, it cannot be started +// again. A new instance must be created. +func (c *configDriftWatcher) stop() { + c.stopCh <- struct{}{} + c.wg.Wait() + glog.Info("Config Drift Monitor has shut down") +} + +// Handles the filesystem event for any of the files we're watching and +// filters any config drift errors to the provided callback. +func (c *configDriftWatcher) handleFileEvent(event fsnotify.Event) error { + err := c.checkMachineConfigForEvent(event) + + if err == nil { + return nil + } + + var cdErr configDriftErr + if errors.As(err, &cdErr) { + c.OnDrift(cdErr) + // Don't bubble this error up further since it's handled by OnDrift. + return nil + } + + return fmt.Errorf("unknown config drift error: %w", err) +} + +// Validates on disk state for potential config drift. +func (c *configDriftWatcher) checkMachineConfigForEvent(event fsnotify.Event) error { + // Ignore events for files not found in the MachineConfig. + if !c.filePaths.Has(event.Name) { + return nil + } + + if err := validateOnDiskState(c.MachineConfig, c.SystemdPath); err != nil { + return configDriftErr(err) + } + + return nil +} + +// Finds the paths for all files in a given MachineConfig. +func getFilePathsFromMachineConfig(mc *mcfgv1.MachineConfig, systemdPath string) (sets.String, error) { + ignConfig, err := ctrlcommon.IgnParseWrapper(mc.Spec.Config.Raw) + if err != nil { + return sets.String{}, fmt.Errorf("could not get dirs from ignition config: %w", err) + } + + switch typedConfig := ignConfig.(type) { + case ign3types.Config: + return getFilePathsFromIgn3Config(ignConfig.(ign3types.Config), systemdPath), nil + case ign2types.Config: + return getFilePathsFromIgn2Config(ignConfig.(ign2types.Config), systemdPath), nil + default: + return sets.String{}, fmt.Errorf("unexpected type for ignition config: %v", typedConfig) + } +} + +// Extracts all unique directories from a given Ignition 3 config. +func getFilePathsFromIgn3Config(ignConfig ign3types.Config, systemdPath string) sets.String { + files := sets.NewString() + + // Get all the file paths from the ignition config + for _, ignFile := range ignConfig.Storage.Files { + if _, err := os.Stat(ignFile.Path); err == nil && !os.IsNotExist(err) { + files.Insert(ignFile.Path) + } + } + + // Get all the file paths for systemd dropins from the ignition config + for _, unit := range ignConfig.Systemd.Units { + unitPath := getIgn3SystemdUnitPath(systemdPath, unit) + if _, err := os.Stat(unitPath); err == nil && !os.IsNotExist(err) { + files.Insert(unitPath) + } + + for _, dropin := range unit.Dropins { + dropinPath := getIgn3SystemdDropinPath(systemdPath, unit, dropin) + if _, err := os.Stat(dropinPath); err == nil && !os.IsNotExist(err) { + files.Insert(dropinPath) + } + } + } + + return files +} + +// Extracts all unique directories from a given Ignition 2 config. +func getFilePathsFromIgn2Config(ignConfig ign2types.Config, systemdPath string) sets.String { + files := sets.NewString() + + // Get all the file paths from the ignition config + for _, ignFile := range ignConfig.Storage.Files { + if _, err := os.Stat(ignFile.Path); err == nil && !os.IsNotExist(err) { + files.Insert(ignFile.Path) + } + } + + // Get all the file paths for systemd dropins from the ignition config + for _, unit := range ignConfig.Systemd.Units { + unitPath := getIgn2SystemdUnitPath(systemdPath, unit) + if _, err := os.Stat(unitPath); err == nil && !os.IsNotExist(err) { + files.Insert(unitPath) + } + + for _, dropin := range unit.Dropins { + dropinPath := getIgn2SystemdDropinPath(systemdPath, unit, dropin) + if _, err := os.Stat(dropinPath); err == nil && !os.IsNotExist(err) { + files.Insert(dropinPath) + } + } + } + + return files +} + +// Gets the directories for all the MachineConfig file paths while +// deduplicating them. +func getDirPathsFromFilePaths(filePaths sets.String) []string { + dirPaths := sets.NewString() + + for filePath := range filePaths { + dirPaths.Insert(filepath.Dir(filePath)) + } + + return dirPaths.List() +} diff --git a/pkg/daemon/config_drift_monitor_test.go b/pkg/daemon/config_drift_monitor_test.go new file mode 100644 index 0000000000..026893bfff --- /dev/null +++ b/pkg/daemon/config_drift_monitor_test.go @@ -0,0 +1,430 @@ +package daemon + +import ( + "errors" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strings" + "sync" + "testing" + "time" + + ign3types "github.com/coreos/ignition/v2/config/v3_2/types" + mcfgv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1" + "github.com/openshift/machine-config-operator/test/helpers" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "k8s.io/apimachinery/pkg/util/uuid" +) + +func TestConfigDriftMonitor(t *testing.T) { + // Our errors. Their contents don't matter because we're not basing our + // assertions off of their contents. Instead, we're more concerned about + // their types. + fileErr := configDriftErr(fileConfigDriftErr(fmt.Errorf("file error"))) + unitErr := configDriftErr(unitConfigDriftErr(fmt.Errorf("unit error"))) + + // Filesystem Mutators + // These are closures to avoid namespace collisions and pollution since + // they're not useful outside of this test. + changeFileContent := func(path string) error { + return ioutil.WriteFile(path, []byte("notthecontents"), defaultFilePermissions) + } + + touchFile := func(path string) error { + now := time.Now() + return os.Chtimes(path, now.Local(), now.Local()) + } + + renameFile := func(path string) error { + return os.Rename(path, path+".old") + } + + overwriteFile := func(path string) error { + tmpPath := filepath.Join(filepath.Dir(path), "overwritten") + if err := writeFileAtomicallyWithDefaults(tmpPath, []byte("notthecontents")); err != nil { + return err + } + + return os.Rename(tmpPath, path) + } + + chmodFile := func(path string) error { + return os.Chmod(path, 0755) + } + + // The general idea for this test is as follows: + // 1. We create a temporary directory. + // 2. For each test case, we create an Ignition config as usual. + // 3. Apply the temp dir to paths in the testcase Ignition config. + // 4. Write the files to the temporary directory. + // 5. Start the Config Drift Monitor and supply a mock ondrift function. + // 6. Mutate the files on disk with our mutate func. + // 7. Wait a few milliseconds for the watcher to catch up. + // 8. Assert that we have config drift. + // 9. Shut down the monitor and move on. + // + // Note: We try to run the testcases in parallel for speed with each one + // mutating its own temp directory. + testCases := []configDriftMonitorTestCase{ + // Ignition File + // These target the file called /etc/a-config-file defined by the test + // fixture. + { + name: "ign file content drift", + expectedErr: fileErr, + mutateFile: changeFileContent, + }, + { + name: "ign file touch", + mutateFile: touchFile, + }, + { + name: "ign file rename", + expectedErr: fileErr, + mutateFile: renameFile, + }, + { + name: "ign file delete", + expectedErr: fileErr, + mutateFile: os.Remove, + }, + { + name: "ign file overwrite", + expectedErr: fileErr, + mutateFile: overwriteFile, + }, + { + name: "ign file chmod", + expectedErr: fileErr, + mutateFile: chmodFile, + }, + // Systemd Unit tests + // These target the systemd unit files in the test fixture: + // /etc/systemd/system/unittest.service + { + name: "ign unit content drift", + expectedErr: unitErr, + mutateUnit: changeFileContent, + }, + { + name: "ign unit touch", + mutateUnit: touchFile, + }, + { + name: "ign unit rename", + expectedErr: unitErr, + mutateUnit: renameFile, + }, + { + name: "ign unit delete", + expectedErr: unitErr, + mutateUnit: os.Remove, + }, + { + name: "ign unit overwrite", + expectedErr: unitErr, + mutateUnit: overwriteFile, + }, + { + name: "ign unit chmod", + expectedErr: unitErr, + mutateUnit: chmodFile, + }, + // Systemd Dropin tests + // These target the dropin files belonging to the unittest systemd unit in + // the test fixture: /etc/systemd/system/unittest.service.d/10-unittest-service.conf + { + name: "ign dropin content drift", + expectedErr: unitErr, + mutateDropin: changeFileContent, + }, + { + name: "ign dropin touch", + mutateDropin: touchFile, + }, + { + name: "ign dropin rename", + expectedErr: unitErr, + mutateDropin: renameFile, + }, + { + name: "ign dropin delete", + expectedErr: unitErr, + mutateDropin: os.Remove, + }, + { + name: "ign dropin overwrite", + expectedErr: unitErr, + mutateDropin: overwriteFile, + }, + { + name: "ign dropin chmod", + expectedErr: unitErr, + mutateDropin: chmodFile, + }, + } + + for _, testCase := range testCases { + testCase := testCase + t.Run(testCase.name, func(t *testing.T) { + t.Parallel() + + tmpDir := t.TempDir() + defer os.RemoveAll(tmpDir) + + testCase.tmpDir = tmpDir + testCase.systemdPath = filepath.Join(tmpDir, pathSystemd) + + testCase.run(t) + }) + } +} + +// Holds a testcase and its associated helper funcs +type configDriftMonitorTestCase struct { + // Name of the test case + name string + // The expected error, if any + expectedErr error + // The tmpdir for the test case (assigned at runtime) + tmpDir string + // The systemdroot for the test case (assigned at runtime) + systemdPath string + // Only one of these may be used per testcase: + // The mutation to apply to the Ignition file + mutateFile func(string) error + // The mutation to apply to the systemd unit file + mutateUnit func(string) error + // The mutation to apply to the systemd dropin file + mutateDropin func(string) error +} + +// Runs the test case +func (tc configDriftMonitorTestCase) run(t *testing.T) { + // Get our test fixtures + ignConfig, mc := tc.getFixtures(t) + + // Create our unexpected error channel + errChan := make(chan error, 5) + + // Create a new instance of the Config Drift Monitor + cdm := NewConfigDriftMonitor() + + var wg sync.WaitGroup + wg.Add(2) + + // Create a goroutine to listen on the Done() method. + doneCalled := false + go func() { + defer wg.Done() + select { + case <-cdm.Done(): + doneCalled = true + return + } + }() + + // Create a goroutine to listen on the supplied error channel. + go func() { + defer wg.Done() + for { + select { + case err, ok := <-errChan: + // Closing the channel causes a nil value to be sent. So we + // disambiguate a nil error object being sent from the channel closing. + if err != nil { + t.Errorf("unexpected error via error channel: %s", err) + } + + // Only break out of the loop if the error channel is closed. + if !ok { + return + } + } + } + }() + + onDriftCalled := false + + // To listen on when + onDriftChan := make(chan struct{}) + + // Configure the config drift monitor + opts := ConfigDriftMonitorOpts{ + ErrChan: errChan, + SystemdPath: tc.systemdPath, + MachineConfig: mc, + OnDrift: func(err error) { + go func() { + onDriftChan <- struct{}{} + }() + onDriftCalled = true + tc.onDriftFunc(t, err) + }, + } + + // Start the config drift monitor + require.Nil(t, cdm.Start(opts)) + + // Ensure that it's running + assert.True(t, cdm.IsRunning()) + + // Mutate the filesystem + require.Nil(t, tc.mutate(ignConfig)) + + timeout := 100 * time.Millisecond + start := time.Now() + + // Give the watcher time to fire or time out if it doesn't + select { + case <-onDriftChan: + t.Logf("Took %v to fire", time.Since(start)) + case <-time.After(timeout): + if tc.expectedErr != nil { + t.Errorf("expected onDrift to be called, but timed out after: %v", timeout) + } + } + + // Stop the config drift monitor + cdm.Stop() + + // Ensure that it's no longer running + assert.False(t, cdm.IsRunning()) + + // Closing our errChan causes the monitoring goroutine to stop. + close(errChan) + + // Wait for our listener goroutines to shut down + wg.Wait() + + // Ensure that our done channel was called + assert.True(t, doneCalled, "expected to receive a done event") + + // If we expect an error, make sure onDrift was called. Otherwise, make sure + // it wasn't called. + + if tc.expectedErr == nil { + assert.False(t, onDriftCalled, "expected onDrift not to be called") + } else { + assert.True(t, onDriftCalled, "expected onDrift to be called") + } +} + +// Creates the Ignition Config test fixture +func (tc configDriftMonitorTestCase) getIgnConfig() ign3types.Config { + return ign3types.Config{ + Ignition: ign3types.Ignition{ + Version: ign3types.MaxVersion.String(), + }, + Storage: ign3types.Storage{ + Files: []ign3types.File{ + helpers.CreateIgn3File("/etc/a-config-file", "data:,theafilecontents", int(defaultFilePermissions)), + }, + }, + Systemd: ign3types.Systemd{ + Units: []ign3types.Unit{ + { + Name: "unittest.service", + Contents: helpers.StrToPtr("unittest-unit-contents"), + Dropins: []ign3types.Dropin{ + { + Contents: helpers.StrToPtr("unittest-service-contents"), + Name: "10-unittest-service.conf", + }, + { + Name: "20-unittest-service.conf", + }, + }, + }, + }, + }, + } +} + +// Determines which file to mutate and runs the appropriate mutator. +func (tc configDriftMonitorTestCase) mutate(ignConfig ign3types.Config) error { + if tc.mutateFile != nil { + return tc.mutateFile(ignConfig.Storage.Files[0].Path) + } + + if tc.mutateDropin != nil { + dropinPath := getIgn3SystemdDropinPath(tc.systemdPath, ignConfig.Systemd.Units[0], ignConfig.Systemd.Units[0].Dropins[0]) + return tc.mutateDropin(dropinPath) + } + + if tc.mutateUnit != nil { + unitPath := getIgn3SystemdUnitPath(tc.systemdPath, ignConfig.Systemd.Units[0]) + return tc.mutateUnit(unitPath) + } + + return fmt.Errorf("no file mutator provided") +} + +// Creates and modifies the Ignition Config and converts it into a +// MachineConfig suitable for this test. +func (tc configDriftMonitorTestCase) getFixtures(t *testing.T) (ign3types.Config, *mcfgv1.MachineConfig) { + t.Helper() + + ignConfig := tc.getIgnConfig() + + // Prefix all the ignition files with the temp directory and write there. + for i, file := range ignConfig.Storage.Files { + file.Path = filepath.Join(tc.tmpDir, file.Path) + tc.writeFileForTest(t, file.Path, file.Contents.Source) + ignConfig.Storage.Files[i] = file + } + + // Prefix all the systemd files with the temp dir and write them. + for _, unit := range ignConfig.Systemd.Units { + unitPath := getIgn3SystemdUnitPath(tc.systemdPath, unit) + tc.writeFileForTest(t, unitPath, unit.Contents) + for _, dropin := range unit.Dropins { + dropinPath := getIgn3SystemdDropinPath(tc.systemdPath, unit, dropin) + tc.writeFileForTest(t, dropinPath, dropin.Contents) + } + } + + // Create a MachineConfig from our Ignition Config + mc := helpers.CreateMachineConfigFromIgnition(ignConfig) + mc.Name = "config-drift-monitor" + string(uuid.NewUUID()) + + return ignConfig, mc +} + +func (tc configDriftMonitorTestCase) onDriftFunc(t *testing.T, err error) { + // If we're not expecting a configDriftErr, we should not end up here. + if tc.expectedErr == nil { + t.Errorf("expected no config drift error, but got one anyway: %s", err) + } + + // Make sure that we get specific error types based upon the expected + // values + var cdErr configDriftErr + assert.ErrorAs(t, err, &cdErr) + + // If the testcase asks for a fileConfigDriftErr, be sure we got one. + var fErr fileConfigDriftErr + if errors.As(tc.expectedErr, &fErr) { + assert.ErrorAs(t, err, &fErr) + } + + // If the testcase asks for a unitConfigDriftErr, be sure we got one. + var uErr unitConfigDriftErr + if errors.As(tc.expectedErr, &uErr) { + assert.ErrorAs(t, err, &uErr) + } +} + +func (tc configDriftMonitorTestCase) writeFileForTest(t *testing.T, path string, contents *string) { + t.Helper() + // writeFiles doesn't actually work like this. Writing files this way was + // done for the sake of simplicity. + out := "" + if contents != nil { + out = strings.TrimPrefix(*contents, "data:,") + } + require.Nil(t, writeFileAtomicallyWithDefaults(path, []byte(out))) +} diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 0a86e0b54a..2c485a2b44 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -2,7 +2,6 @@ package daemon import ( "bufio" - "bytes" "context" "encoding/json" "fmt" @@ -18,10 +17,8 @@ import ( "syscall" "time" - ign2types "github.com/coreos/ignition/config/v2_2/types" ign3types "github.com/coreos/ignition/v2/config/v3_2/types" "github.com/golang/glog" - "github.com/google/go-cmp/cmp" "github.com/pkg/errors" "golang.org/x/time/rate" corev1 "k8s.io/api/core/v1" @@ -122,6 +119,9 @@ type Daemon struct { loggerSupportsJournal bool drainer *drain.Helper + + // Config Drift Monitor + configDriftMonitor ConfigDriftMonitor } const ( @@ -145,6 +145,7 @@ const ( // the MCC/MCO will time out. We don't want to spam our logs with the same // error. updateDelay = 5 * time.Second + // maxUpdateBackoff is the maximum time to react to a change as we back off // in the face of errors. maxUpdateBackoff = 60 * time.Second @@ -255,6 +256,7 @@ func New( exitCh: exitCh, currentConfigPath: currentConfigPath, loggerSupportsJournal: loggerSupportsJournal, + configDriftMonitor: NewConfigDriftMonitor(), }, nil } @@ -456,6 +458,10 @@ func (dn *Daemon) syncNode(key string) error { // currently we return immediately here, although // I think we should change this to continue. dn.booting = false + + // Start the Config Drift Monitor since we're booted up. + dn.startConfigDriftMonitor() + return nil } @@ -465,6 +471,11 @@ func (dn *Daemon) syncNode(key string) error { return errors.Wrapf(err, "prepping update") } if current != nil || desired != nil { + // Only check for config drift if we need to update. + if err := dn.runPreflightConfigDriftCheck(); err != nil { + return err + } + if err := dn.triggerUpdateWithMachineConfig(current, desired); err != nil { return err } @@ -473,6 +484,34 @@ func (dn *Daemon) syncNode(key string) error { return nil } +// Validates that the on-disk state matches the currently applied machineconfig +// before an update occurs. +func (dn *Daemon) runPreflightConfigDriftCheck() error { + // This allows skip behavior based upon the presence of + // the forcefile: /run/machine-config-daemon-force. + if forceFileExists() { + glog.Infof("Skipping preflight config drift check; %s present", constants.MachineConfigDaemonForceFile) + return nil + } + + currentOnDisk, err := dn.getCurrentConfigOnDisk() + if err != nil { + return fmt.Errorf("could not get on-disk config: %w", err) + } + + start := time.Now() + + if err := dn.validateOnDiskState(currentOnDisk); err != nil { + dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeWarning, "PreflightConfigDriftCheckFailed", err.Error()) + glog.Errorf("Preflight config drift check failed: %v", err) + return configDriftErr(err) + } + + glog.Infof("Preflight config drift check successful (took %s)", time.Since(start)) + + return nil +} + // enqueueDefault calls a default enqueue function func (dn *Daemon) enqueueDefault(node *corev1.Node) { key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(node) @@ -688,6 +727,72 @@ func (dn *Daemon) applySSHAccessedAnnotation() error { return nil } +// Called whenever the on-disk config has drifted from the current machineconfig. +func (dn *Daemon) onConfigDrift(err error) { + dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeWarning, "ConfigDriftDetected", err.Error()) + glog.Error(err) + dn.updateErrorState(err) +} + +func (dn *Daemon) startConfigDriftMonitor() { + // Even though the Config Drift Monitor object ensures that only a single + // Config Drift Watcher is running at any given time, other things, such as + // emitting Kube events on startup, should only occur if we weren't + // previously running. This provides us with a way to short-circuit that path + // if we already have a Config Drift Watcher running. + if dn.configDriftMonitor.IsRunning() { + return + } + + currentConfig, err := dn.getCurrentConfigOnDisk() + if err != nil { + dn.exitCh <- fmt.Errorf("could not get current config from disk: %w", err) + return + } + + opts := ConfigDriftMonitorOpts{ + OnDrift: dn.onConfigDrift, + SystemdPath: pathSystemd, + ErrChan: dn.exitCh, + MachineConfig: currentConfig, + } + + if err := dn.configDriftMonitor.Start(opts); err != nil { + dn.exitCh <- fmt.Errorf("could not start Config Drift Monitor: %w", err) + return + } + + dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "ConfigDriftMonitorStarted", + "Config Drift Monitor started, watching against %s", currentConfig.Name) + + go func() { + // Common shutdown function + shutdown := func() { + // Stop the Config Drift Monitor, if it's not already stopped. + dn.configDriftMonitor.Stop() + // Report that we've shut down + dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "ConfigDriftMonitorStopped", "Config Drift Monitor stopped") + } + + for { + select { + case <-dn.stopCh: + // We got a stop signal from outside the MCD. + shutdown() + return + case <-dn.configDriftMonitor.Done(): + // We got a stop signal from the Config Drift Monitor. + shutdown() + return + } + } + }() +} + +func (dn *Daemon) stopConfigDriftMonitor() { + dn.configDriftMonitor.Stop() +} + func (dn *Daemon) runKubeletHealthzMonitor(stopCh <-chan struct{}, exitCh chan<- error) { failureCount := 0 KubeletHealthState.Set(float64(failureCount)) @@ -1092,16 +1197,19 @@ func (dn *Daemon) checkStateOnFirstRun() error { expectedConfig = state.currentConfig } - if _, err := os.Stat(constants.MachineConfigDaemonForceFile); err != nil { - if err := dn.validateOnDiskState(expectedConfig); err != nil { - return fmt.Errorf("unexpected on-disk state validating against %s: %v", expectedConfig.GetName(), err) - } - glog.Info("Validated on-disk state") - } else { + if forceFileExists() { glog.Infof("Skipping on-disk validation; %s present", constants.MachineConfigDaemonForceFile) return dn.triggerUpdateWithMachineConfig(state.currentConfig, state.desiredConfig) } + if err := dn.validateOnDiskState(expectedConfig); err != nil { + wErr := fmt.Errorf("unexpected on-disk state validating against %s: %w", expectedConfig.GetName(), err) + dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeWarning, "OnDiskStateValidationFailed", wErr.Error()) + return wErr + } + + glog.Info("Validated on-disk state") + // We've validated state. Now, ensure that node is in desired state var inDesiredConfig bool if inDesiredConfig, err = dn.updateConfigAndState(state); err != nil { @@ -1320,6 +1428,10 @@ func (dn *Daemon) triggerUpdateWithMachineConfig(currentConfig, desiredConfig *m } } + // Shut down the Config Drift Monitor since we'll be performing an update + // and the config will "drift" while the update is occurring. + dn.stopConfigDriftMonitor() + // run the update process. this function doesn't currently return. return dn.update(currentConfig, desiredConfig) } @@ -1334,34 +1446,8 @@ func (dn *Daemon) validateOnDiskState(currentConfig *mcfgv1.MachineConfig) error if !osMatch { return errors.Errorf("expected target osImageURL %q, have %q", currentConfig.Spec.OSImageURL, dn.bootedOSImageURL) } - // And the rest of the disk state - // We want to verify the disk state in the spec version that it was created with, - // to remove possibilities of behaviour changes due to translation - ignconfigi, err := ctrlcommon.IgnParseWrapper(currentConfig.Spec.Config.Raw) - if err != nil { - return errors.Errorf("Failed to parse Ignition for validation: %s", err) - } - switch typedConfig := ignconfigi.(type) { - case ign3types.Config: - if err := checkV3Files(ignconfigi.(ign3types.Config).Storage.Files); err != nil { - return err - } - if err := checkV3Units(ignconfigi.(ign3types.Config).Systemd.Units); err != nil { - return err - } - return nil - case ign2types.Config: - if err := checkV2Files(ignconfigi.(ign2types.Config).Storage.Files); err != nil { - return err - } - if err := checkV2Units(ignconfigi.(ign2types.Config).Systemd.Units); err != nil { - return err - } - return nil - default: - return errors.Errorf("unexpected type for ignition config: %v", typedConfig) - } + return validateOnDiskState(currentConfig, pathSystemd) } // compareOSImageURL checks whether the current and desired @@ -1390,164 +1476,6 @@ func (dn *Daemon) checkOS(osImageURL string) bool { return compareOSImageURL(dn.bootedOSImageURL, osImageURL) } -// checkUnits validates the contents of all the units in the -// target config and returns true if they match. -func checkV3Units(units []ign3types.Unit) error { - for _, u := range units { - for j := range u.Dropins { - path := filepath.Join(pathSystemd, u.Name+".d", u.Dropins[j].Name) - - var content string - if u.Dropins[j].Contents == nil { - content = "" - } else { - content = *u.Dropins[j].Contents - } - - // As of 4.7 we now remove any empty defined dropins, check for that first - if _, err := os.Stat(path); content == "" && err != nil { - if os.IsNotExist(err) { - continue - } - return err - } - - // To maintain backwards compatibility, we allow existing zero length files to exist. - // Thus we are also ok if the dropin exists but has no content - if err := checkFileContentsAndMode(path, []byte(content), defaultFilePermissions); err != nil { - return err - } - } - - if u.Contents == nil || *u.Contents == "" { - continue - } - - path := filepath.Join(pathSystemd, u.Name) - if u.Mask != nil && *u.Mask { - link, err := filepath.EvalSymlinks(path) - if err != nil { - return errors.Wrapf(err, "state validation: error while evaluation symlink for path %q", path) - } - if strings.Compare(pathDevNull, link) != 0 { - return errors.Errorf("state validation: invalid unit masked setting. path: %q; expected: %v; received: %v", path, pathDevNull, link) - } - } - if err := checkFileContentsAndMode(path, []byte(*u.Contents), defaultFilePermissions); err != nil { - return err - } - - } - return nil -} - -func checkV2Units(units []ign2types.Unit) error { - for _, u := range units { - for j := range u.Dropins { - path := filepath.Join(pathSystemd, u.Name+".d", u.Dropins[j].Name) - if err := checkFileContentsAndMode(path, []byte(u.Dropins[j].Contents), defaultFilePermissions); err != nil { - return err - } - } - - if u.Contents == "" { - continue - } - - path := filepath.Join(pathSystemd, u.Name) - if u.Mask { - link, err := filepath.EvalSymlinks(path) - if err != nil { - return errors.Wrapf(err, "state validation: error while evaluation symlink for path %q", path) - } - if strings.Compare(pathDevNull, link) != 0 { - return errors.Errorf("state validation: invalid unit masked setting. path: %q; expected: %v; received: %v", path, pathDevNull, link) - } - } - if err := checkFileContentsAndMode(path, []byte(u.Contents), defaultFilePermissions); err != nil { - return err - } - - } - return nil -} - -// checkFiles validates the contents of all the files in the -// target config. - -// V3 files should not have any duplication anymore, so there is -// no need to check for overwrites. -func checkV3Files(files []ign3types.File) error { - for _, f := range files { - if len(f.Append) > 0 { - return fmt.Errorf("found an append section when checking files. Append is not supported") - } - mode := defaultFilePermissions - if f.Mode != nil { - mode = os.FileMode(*f.Mode) - } - decodedContents, err := decodeContents(f.Contents.Source, f.Contents.Compression) - if err != nil { - return fmt.Errorf("could not decode file %q: %w", f.Path, err) - } - - if err := checkFileContentsAndMode(f.Path, decodedContents, mode); err != nil { - return err - } - } - return nil -} - -func checkV2Files(files []ign2types.File) error { - checkedFiles := make(map[string]bool) - for i := len(files) - 1; i >= 0; i-- { - f := files[i] - // skip over checked validated files - if _, ok := checkedFiles[f.Path]; ok { - continue - } - if f.Append { - return fmt.Errorf("found an append section when checking files. Append is not supported") - } - mode := defaultFilePermissions - if f.Mode != nil { - mode = os.FileMode(*f.Mode) - } - decodedContents, err := decodeContents(&f.Contents.Source, &f.Contents.Compression) - if err != nil { - return fmt.Errorf("could not decode file %q: %w", f.Path, err) - } - if err := checkFileContentsAndMode(f.Path, decodedContents, mode); err != nil { - return err - } - checkedFiles[f.Path] = true - } - return nil -} - -// checkFileContentsAndMode reads the file from the filepath and compares its -// contents and mode with the expectedContent and mode parameters. It logs an -// error in case of an error or mismatch and returns the status of the -// evaluation. -func checkFileContentsAndMode(filePath string, expectedContent []byte, mode os.FileMode) error { - fi, err := os.Lstat(filePath) - if err != nil { - return errors.Wrapf(err, "could not stat file %q", filePath) - } - if fi.Mode() != mode { - return errors.Errorf("mode mismatch for file: %q; expected: %[2]v/%[2]d/%#[2]o; received: %[3]v/%[3]d/%#[3]o", filePath, mode, fi.Mode()) - } - contents, err := ioutil.ReadFile(filePath) - if err != nil { - return errors.Wrapf(err, "could not read file %q", filePath) - } - if !bytes.Equal(contents, expectedContent) { - glog.Errorf("content mismatch for file %q (-want +got):\n%s", filePath, cmp.Diff(expectedContent, contents)) - return errors.Errorf("content mismatch for file %q", filePath) - } - return nil -} - // Close closes all the connections the node agent has open for it's lifetime func (dn *Daemon) Close() { } @@ -1637,3 +1565,15 @@ func (dn *Daemon) getControlPlaneTopology() configv1.TopologyMode { return configv1.HighlyAvailableTopologyMode } } + +// forceFileExists determines if /run/machine-config-daemon-force is present. +func forceFileExists() bool { + _, err := os.Stat(constants.MachineConfigDaemonForceFile) + + // No error means we could stat the file; it exists + if err == nil { + return true + } + + return false +} diff --git a/pkg/daemon/on_disk_validation.go b/pkg/daemon/on_disk_validation.go new file mode 100644 index 0000000000..44c0c41e20 --- /dev/null +++ b/pkg/daemon/on_disk_validation.go @@ -0,0 +1,247 @@ +package daemon + +import ( + "bytes" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strings" + + ign2types "github.com/coreos/ignition/config/v2_2/types" + ign3types "github.com/coreos/ignition/v2/config/v3_2/types" + "github.com/golang/glog" + "github.com/google/go-cmp/cmp" + mcfgv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1" + ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" + "github.com/pkg/errors" + "github.com/vincent-petithory/dataurl" +) + +// Validates that the on-disk state matches a given MachineConfig. +func validateOnDiskState(currentConfig *mcfgv1.MachineConfig, systemdPath string) error { + // And the rest of the disk state + // We want to verify the disk state in the spec version that it was created with, + // to remove possibilities of behaviour changes due to translation + ignconfigi, err := ctrlcommon.IgnParseWrapper(currentConfig.Spec.Config.Raw) + if err != nil { + return errors.Errorf("Failed to parse Ignition for validation: %s", err) + } + + switch typedConfig := ignconfigi.(type) { + case ign3types.Config: + if err := checkV3Files(ignconfigi.(ign3types.Config).Storage.Files); err != nil { + return fileConfigDriftErr(err) + } + if err := checkV3Units(ignconfigi.(ign3types.Config).Systemd.Units, systemdPath); err != nil { + return unitConfigDriftErr(err) + } + return nil + case ign2types.Config: + if err := checkV2Files(ignconfigi.(ign2types.Config).Storage.Files); err != nil { + return fileConfigDriftErr(err) + } + if err := checkV2Units(ignconfigi.(ign2types.Config).Systemd.Units, systemdPath); err != nil { + return unitConfigDriftErr(err) + } + return nil + default: + return errors.Errorf("unexpected type for ignition config: %v", typedConfig) + } +} + +// Checks that the ondisk state for a systemd dropin matches the expected state. +func checkV3Dropin(systemdPath string, unit ign3types.Unit, dropin ign3types.Dropin) error { + path := getIgn3SystemdDropinPath(systemdPath, unit, dropin) + + var content string + if dropin.Contents == nil { + content = "" + } else { + content = *dropin.Contents + } + + // As of 4.7 we now remove any empty defined dropins, check for that first + if _, err := os.Stat(path); content == "" && err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + + // To maintain backwards compatibility, we allow existing zero length files to exist. + // Thus we are also ok if the dropin exists but has no content + return checkFileContentsAndMode(path, []byte(content), defaultFilePermissions) +} + +// checkV3Units validates the contents of all the units in the +// target config and returns nil if they match. +func checkV3Units(units []ign3types.Unit, systemdPath string) error { + for _, unit := range units { + for _, dropin := range unit.Dropins { + if err := checkV3Dropin(systemdPath, unit, dropin); err != nil { + return err + } + } + + if unit.Contents == nil || *unit.Contents == "" { + continue + } + + path := getIgn3SystemdUnitPath(systemdPath, unit) + if unit.Mask != nil && *unit.Mask { + link, err := filepath.EvalSymlinks(path) + if err != nil { + return errors.Wrapf(err, "state validation: error while evaluation symlink for path %q", path) + } + if strings.Compare(pathDevNull, link) != 0 { + return errors.Errorf("state validation: invalid unit masked setting. path: %q; expected: %v; received: %v", path, pathDevNull, link) + } + } + if err := checkFileContentsAndMode(path, []byte(*unit.Contents), defaultFilePermissions); err != nil { + return err + } + + } + return nil +} + +// checkV2Units validates the contents of all the units in the +// target config and returns nil if they match. +func checkV2Units(units []ign2types.Unit, systemdPath string) error { + for _, unit := range units { + for _, dropin := range unit.Dropins { + path := getIgn2SystemdDropinPath(systemdPath, unit, dropin) + if err := checkFileContentsAndMode(path, []byte(dropin.Contents), defaultFilePermissions); err != nil { + return err + } + } + + if unit.Contents == "" { + continue + } + + path := getIgn2SystemdUnitPath(systemdPath, unit) + if unit.Mask { + link, err := filepath.EvalSymlinks(path) + if err != nil { + return errors.Wrapf(err, "state validation: error while evaluation symlink for path %q", path) + } + if strings.Compare(pathDevNull, link) != 0 { + return errors.Errorf("state validation: invalid unit masked setting. path: %q; expected: %v; received: %v", path, pathDevNull, link) + } + } + if err := checkFileContentsAndMode(path, []byte(unit.Contents), defaultFilePermissions); err != nil { + return err + } + + } + return nil +} + +// checkV3Files validates the contents of all the files in the target config. +// V3 files should not have any duplication anymore, so there is no need to +// check for overwrites. +func checkV3Files(files []ign3types.File) error { + for _, f := range files { + if len(f.Append) > 0 { + return fmt.Errorf("found an append section when checking files. Append is not supported") + } + mode := defaultFilePermissions + if f.Mode != nil { + mode = os.FileMode(*f.Mode) + } + contents := &dataurl.DataURL{} + if f.Contents.Source != nil { + var err error + contents, err = dataurl.DecodeString(*f.Contents.Source) + if err != nil { + return errors.Wrapf(err, "couldn't parse file %q", f.Path) + } + } + if err := checkFileContentsAndMode(f.Path, contents.Data, mode); err != nil { + return err + } + } + return nil +} + +// checkV2Files validates the contents of all the files in the target config. +func checkV2Files(files []ign2types.File) error { + checkedFiles := make(map[string]bool) + for i := len(files) - 1; i >= 0; i-- { + f := files[i] + // skip over checked validated files + if _, ok := checkedFiles[f.Path]; ok { + continue + } + if f.Append { + return fmt.Errorf("found an append section when checking files. Append is not supported") + } + mode := defaultFilePermissions + if f.Mode != nil { + mode = os.FileMode(*f.Mode) + } + contents, err := dataurl.DecodeString(f.Contents.Source) + if err != nil { + return errors.Wrapf(err, "couldn't parse file %q", f.Path) + } + if err := checkFileContentsAndMode(f.Path, contents.Data, mode); err != nil { + return err + } + checkedFiles[f.Path] = true + } + return nil +} + +// checkFileContentsAndMode reads the file from the filepath and compares its +// contents and mode with the expectedContent and mode parameters. It logs an +// error in case of an error or mismatch and returns the status of the +// evaluation. +func checkFileContentsAndMode(filePath string, expectedContent []byte, mode os.FileMode) error { + fi, err := os.Lstat(filePath) + if err != nil { + return errors.Wrapf(err, "could not stat file %q", filePath) + } + if fi.Mode() != mode { + return errors.Errorf("mode mismatch for file: %q; expected: %[2]v/%[2]d/%#[2]o; received: %[3]v/%[3]d/%#[3]o", filePath, mode, fi.Mode()) + } + contents, err := ioutil.ReadFile(filePath) + if err != nil { + return errors.Wrapf(err, "could not read file %q", filePath) + } + if !bytes.Equal(contents, expectedContent) { + glog.Errorf("content mismatch for file %q (-want +got):\n%s", filePath, cmp.Diff(expectedContent, contents)) + return errors.Errorf("content mismatch for file %q", filePath) + } + return nil +} + +// Gets the absolute path for a systemd unit and dropin, given a root path. +func getIgn2SystemdDropinPath(systemdPath string, unit ign2types.Unit, dropin ign2types.SystemdDropin) string { + return filepath.Join(getSystemdPath(systemdPath), unit.Name+".d", dropin.Name) +} + +// Gets the absolute path for a systemd unit and dropin, given a root path. +func getIgn3SystemdDropinPath(systemdPath string, unit ign3types.Unit, dropin ign3types.Dropin) string { + return filepath.Join(getSystemdPath(systemdPath), unit.Name+".d", dropin.Name) +} + +// Computes the absolute path for a given system unit file. +func getIgn2SystemdUnitPath(systemdPath string, unit ign2types.Unit) string { + return filepath.Join(getSystemdPath(systemdPath), unit.Name) +} + +// Computes the absolute path for a given system unit file. +func getIgn3SystemdUnitPath(systemdPath string, unit ign3types.Unit) string { + return filepath.Join(getSystemdPath(systemdPath), unit.Name) +} + +// Gets the systemd path. Defaults to pathSystemd, if empty. +func getSystemdPath(systemdPath string) string { + if systemdPath == "" { + systemdPath = pathSystemd + } + + return systemdPath +} diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go index cccc4f57e7..b6aee96981 100644 --- a/pkg/daemon/update.go +++ b/pkg/daemon/update.go @@ -159,6 +159,8 @@ func (dn *Daemon) performPostConfigChangeAction(postConfigChangeActions []string return fmt.Errorf("Could not apply update: setting node's state to Done failed. Error: %v", err) } if inDesiredConfig { + // (re)start the config drift monitor since rebooting isn't needed. + dn.startConfigDriftMonitor() return nil } diff --git a/test/e2e-shared-tests/README.md b/test/e2e-shared-tests/README.md new file mode 100644 index 0000000000..a5fd796a7d --- /dev/null +++ b/test/e2e-shared-tests/README.md @@ -0,0 +1,16 @@ +# e2e Shared Tests + +To reduce duplication across multiple e2e test suites, tests which can be +reused across all MCO test suites should live within this package. + +## To Add A New Shared Test + +1. Create a new file in this directory named after your test. Do not follow the + Golang `_test.go` convention since symbols within files with the `_test.go` + suffix cannot be imported from another package. +2. Create a struct implementing the `SharedTest` interface to contain your test. +3. In `shared.go`, add any options your test may require to the `SharedTestOpts` struct. +4. Wire your new test struct into the testCase in `shared.go`. +5. Update the `TestRunShared` tests in `./test/e2e/mcd_test.go` and + `./test/e2e-single-node/sno_mcd_test.go` with the opts struct your new test + takes (if it takes one). diff --git a/test/e2e-shared-tests/helpers.go b/test/e2e-shared-tests/helpers.go new file mode 100644 index 0000000000..736e55ca70 --- /dev/null +++ b/test/e2e-shared-tests/helpers.go @@ -0,0 +1,266 @@ +package e2e_shared_test + +import ( + "context" + "fmt" + "path/filepath" + "strings" + "testing" + "time" + + mcfgv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1" + "github.com/openshift/machine-config-operator/pkg/daemon/constants" + "github.com/openshift/machine-config-operator/test/framework" + "github.com/openshift/machine-config-operator/test/helpers" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + kubeErrs "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/apimachinery/pkg/util/wait" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func assertKernelArgsEqual(t *testing.T, cs *framework.ClientSet, target corev1.Node, startKargs string, expectedKargs []string) { + t.Helper() + + kargs := getKernelArgs(t, cs, target) + + for _, karg := range expectedKargs { + assert.Contains(t, kargs, karg, "Missing karg %s", karg) + } + + assert.Equal(t, kargs, startKargs) +} + +func getKernelArgs(t *testing.T, cs *framework.ClientSet, target corev1.Node) string { + return helpers.ExecCmdOnNode(t, cs, target, "chroot", "/rootfs", "/usr/bin/rpm-ostree", "kargs") +} + +func waitForConfigDriftMonitorStart(t *testing.T, cs *framework.ClientSet, node corev1.Node) { + t.Helper() + + // With the way that the Config Drift Monitor is wired into the MCD, + // "machineconfiguration.openshift.io/state" gets set to "Done" before the + // Config Drift Monitor is started. This is fine, however it makes testing a + // bit tricky as we rely upon "machineconfiguration.openshift.io/state" to be + // "Done" before we commence our test. + // + // Because of this, we must infer that the Config Drift Monitor is started by + // looking at the MCD pod logs, hence this function. It's also worth noting + // that the Config Drift Monitor text may not immediately appear in the log, + // so we have to poll for it. + + start := time.Now() + + err := wait.PollImmediate(2*time.Second, 1*time.Minute, func() (bool, error) { + mcdPod, err := helpers.MCDForNode(cs, &node) + require.Nil(t, err) + + logs, err := cs.Pods(mcdPod.Namespace).GetLogs(mcdPod.Name, &corev1.PodLogOptions{ + Container: "machine-config-daemon", + }).DoRaw(context.TODO()) + require.Nil(t, err) + + return strings.Contains(string(logs), configDriftMonitorStartupMsg), nil + }) + + end := time.Since(start) + + if err == nil { + t.Logf("Config Drift Monitor is running (waited %v)", end) + } + + require.Nil(t, err, "expected config drift monitor to start (waited %v)", end) +} + +func assertNodeAndMCPIsDegraded(t *testing.T, cs *framework.ClientSet, node corev1.Node, mcp mcfgv1.MachineConfigPool, filename string) { + t.Helper() + + logEntry := fmt.Sprintf("content mismatch for file \"%s\"", filename) + + // Assert that the node eventually reaches a Degraded state and has the + // config mismatch as the reason + t.Log("Verifying node becomes degraded due to config mismatch") + + assertNodeReachesState(t, cs, node, func(n corev1.Node) bool { + isDegraded := n.Annotations[constants.MachineConfigDaemonStateAnnotationKey] == string(constants.MachineConfigDaemonStateDegraded) + hasReason := strings.Contains(n.Annotations[constants.MachineConfigDaemonReasonAnnotationKey], logEntry) + return isDegraded && hasReason + }) + + mcdPod, err := helpers.MCDForNode(cs, &node) + require.Nil(t, err) + + assertLogsContain(t, cs, mcdPod, logEntry) + + // Assert that the MachineConfigPool eventually reaches a degraded state and has the config mismatch as the reason. + t.Log("Verifying MachineConfigPool becomes degraded due to config mismatch") + + assertPoolReachesState(t, cs, mcp, func(m mcfgv1.MachineConfigPool) bool { + trueConditions := []mcfgv1.MachineConfigPoolConditionType{ + mcfgv1.MachineConfigPoolDegraded, + mcfgv1.MachineConfigPoolNodeDegraded, + mcfgv1.MachineConfigPoolUpdating, + } + + falseConditions := []mcfgv1.MachineConfigPoolConditionType{ + mcfgv1.MachineConfigPoolRenderDegraded, + mcfgv1.MachineConfigPoolUpdated, + } + + return m.Status.DegradedMachineCount == 1 && + allMCPConditionsTrue(trueConditions, m) && + allMCPConditionsFalse(falseConditions, m) + }) +} + +func assertLogsContain(t *testing.T, cs *framework.ClientSet, mcdPod *corev1.Pod, expectedContents string) { + logs, err := cs.Pods(mcdPod.Namespace).GetLogs(mcdPod.Name, &corev1.PodLogOptions{ + Container: "machine-config-daemon", + }).DoRaw(context.TODO()) + require.Nil(t, err) + + if !strings.Contains(string(logs), expectedContents) { + t.Fatalf("expected to find '%s' in logs for %s/%s", expectedContents, mcdPod.Namespace, mcdPod.Name) + } +} + +func assertNodeAndMCPIsRecovered(t *testing.T, cs *framework.ClientSet, node corev1.Node, mcp mcfgv1.MachineConfigPool) { + t.Helper() + + t.Log("Verifying node has recovered from config mismatch") + // Assert that the node eventually reaches a Done state and its reason is + // cleared + assertNodeReachesState(t, cs, node, func(n corev1.Node) bool { + isDone := n.Annotations[constants.MachineConfigDaemonStateAnnotationKey] == "Done" + hasClearedReason := n.Annotations[constants.MachineConfigDaemonReasonAnnotationKey] == "" + return isDone && hasClearedReason + }) + + t.Log("Verifying MachineConfigPool has recovered from config mismatch") + // Assert that the MachineConfigPool eventually recovers. + assertPoolReachesState(t, cs, mcp, func(m mcfgv1.MachineConfigPool) bool { + falseConditions := []mcfgv1.MachineConfigPoolConditionType{ + mcfgv1.MachineConfigPoolDegraded, + mcfgv1.MachineConfigPoolNodeDegraded, + mcfgv1.MachineConfigPoolRenderDegraded, + mcfgv1.MachineConfigPoolUpdating, + } + + trueConditions := []mcfgv1.MachineConfigPoolConditionType{ + mcfgv1.MachineConfigPoolUpdated, + } + + return m.Status.DegradedMachineCount == 0 && + allMCPConditionsTrue(trueConditions, m) && + allMCPConditionsFalse(falseConditions, m) + }) +} + +func assertNodeReachesState(t *testing.T, cs *framework.ClientSet, target corev1.Node, stateFunc func(corev1.Node) bool) { + t.Helper() + + maxWait := 5 * time.Minute + + end, err := pollForResourceState(maxWait, func() (bool, error) { + node, err := cs.Nodes().Get(context.TODO(), target.Name, metav1.GetOptions{}) + return stateFunc(*node), err + }) + + if err != nil { + t.Fatalf("Node %s did not reach expected state (took %v): %s", target.Name, end, err) + } + + t.Logf("Node %s reached expected state (took %v)", target.Name, end) +} + +func assertPoolReachesState(t *testing.T, cs *framework.ClientSet, target mcfgv1.MachineConfigPool, stateFunc func(mcfgv1.MachineConfigPool) bool) { + t.Helper() + + maxWait := 5 * time.Minute + + end, err := pollForResourceState(maxWait, func() (bool, error) { + mcp, err := cs.MachineConfigPools().Get(context.TODO(), target.Name, metav1.GetOptions{}) + return stateFunc(*mcp), err + }) + + if err != nil { + t.Fatalf("MachineConfigPool %s did not reach expected state (took %v): %s", target.Name, end, err) + } + + t.Logf("MachineConfigPool %s reached expected state (took %v)", target.Name, end) +} + +func pollForResourceState(timeout time.Duration, pollFunc func() (bool, error)) (time.Duration, error) { + // This wraps wait.PollImmediate() for the following reason: + // + // If the control plane is temporarily unavailable (e.g., when running in a + // single-node OpenShift (SNO) context and the node reboots), this error will + // not be nil, but *should* go back to nil once the control-plane becomes + // available again. To handle that, we: + // + // 1. Store the error within the pollForResourceState scope. + // 2. Run the clock out. + // 3. Handle the error (if it does not go back to nil) when the timeout is reached. + // + // This was inspired by and is a more generic implementation of: + // https://github.com/openshift/machine-config-operator/blob/master/test/e2e-single-node/sno_mcd_test.go#L355-L374 + start := time.Now() + + var lastErr error + + waitErr := wait.PollImmediate(1*time.Second, timeout, func() (bool, error) { + result, err := pollFunc() + lastErr = err + return result, nil + }) + + return time.Since(start), kubeErrs.NewAggregate([]error{ + lastErr, + waitErr, + }) +} + +func mutateFileOnNode(t *testing.T, cs *framework.ClientSet, node corev1.Node, filename, contents string) { + t.Helper() + + if !strings.HasPrefix(filename, "/rootfs") { + filename = filepath.Join("/rootfs", filename) + } + + bashCmd := fmt.Sprintf("printf '%s' > %s", contents, filename) + t.Logf("Setting contents of %s on %s to %s", filename, node.Name, contents) + + helpers.ExecCmdOnNode(t, cs, node, "/bin/bash", "-c", bashCmd) +} + +func allMCPConditionsTrue(conditions []mcfgv1.MachineConfigPoolConditionType, mcp mcfgv1.MachineConfigPool) bool { + for _, condition := range conditions { + if !mcfgv1.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, condition) { + return false + } + } + + return true +} + +func allMCPConditionsFalse(conditions []mcfgv1.MachineConfigPoolConditionType, mcp mcfgv1.MachineConfigPool) bool { + for _, condition := range conditions { + if !mcfgv1.IsMachineConfigPoolConditionFalse(mcp.Status.Conditions, condition) { + return false + } + } + + return true +} + +func timeIt(t *testing.T, info string, timedFunc func()) { + start := time.Now() + + defer func() { + t.Logf("%s (took %v)", info, time.Since(start)) + }() + + timedFunc() +} diff --git a/test/e2e-shared-tests/mcd_config_drift.go b/test/e2e-shared-tests/mcd_config_drift.go new file mode 100644 index 0000000000..dde473eab8 --- /dev/null +++ b/test/e2e-shared-tests/mcd_config_drift.go @@ -0,0 +1,239 @@ +package e2e_shared_test + +import ( + "context" + "fmt" + "path/filepath" + "testing" + + ign3types "github.com/coreos/ignition/v2/config/v3_2/types" + mcfgv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1" + "github.com/openshift/machine-config-operator/pkg/daemon/constants" + "github.com/openshift/machine-config-operator/test/framework" + "github.com/openshift/machine-config-operator/test/helpers" + "github.com/stretchr/testify/require" + "k8s.io/apimachinery/pkg/util/uuid" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +const ( + configDriftSystemdUnitFilename string = "/etc/systemd/system/unittest.service" + configDriftSystemdUnitFileContents string = "unittest-service-unit-contents" + configDriftSystemdDropinFilename string = "/etc/systemd/system/unittest.service.d/10-unittest-service.conf" + configDriftSystemdDropinFileContents string = "unittest-service-dropin-contents" + configDriftFilename string = "/etc/etc-file" + configDriftFileContents string = "expected-file-data" + configDriftMCPrefix string = "mcd-config-drift" + configDriftMonitorStartupMsg string = "Config Drift Monitor started" +) + +// This test does the following: +// 1. Creates a MachineConfig whose application is deferred to the setup +// function. This is because the setup function may need to set up other things +// such as the MachineConfigPool, label the node, etc. This allows reuse in +// both a highly available context as well as single-node context. +// 2. Mutates the file specified by the MachineConfig on the target node so its +// contents do not match what the MachineConfig specifies. +// 3. Creates a new MachineConfig to trigger an update. +// 4. Verifies that the target node and MachineConfigPool become degraded. +// 5. Mutates the file specified by the MachineConfig to make its contents +// equal to that specified by the MachineConfig. +// 6. Verifies that the target node and MachineConfigPool recover from this. +// 7. Again mutates the file specified by the MachineConfig on the target node so its +// contents do not match what the MachineConfig specifies. +// 8. Sets the forcefile on the target node. +// 9. Verifies that the target node and MachineConfigPool recover from this. +// +// The setup function is used to set up the desired MachineConfigPool (if +// needed), apply the config drift MachineConfig, wait for the +// MachineConfigPool to be available, etc. +// +// The teardown function is used to delete the MachineConfigPool (if desired), +// delete the MachineConfig, etc. + +// Holds the options used to configure the Config Drift Test +type ConfigDriftTestOpts struct { + // The target MachineConfigPool name + MCPName string + // The setup function + SetupFunc func(*mcfgv1.MachineConfig) + // The teardown function + TeardownFunc func() + // ClientSet + ClientSet *framework.ClientSet + // Skips the forcefile recovery portion + SkipForcefile bool +} + +// Holds the Config Drift Test functions. +type configDriftTest struct { + ConfigDriftTestOpts + + // The Machine Config generated for this test + mc *mcfgv1.MachineConfig + + // This is the node this test will target + node corev1.Node + + // This is the pod this test will target + pod corev1.Pod + + // This is the Machine Config Pool this test will target + mcp mcfgv1.MachineConfigPool +} + +// Does some basic setup, delegates to the attached SetupFunc, and retrieves +// the target objects for this test. +func (c *configDriftTest) Setup(t *testing.T) { + if c.SetupFunc == nil { + t.Fatalf("no setup function") + } + + if c.TeardownFunc == nil { + t.Fatalf("no teardown function") + } + + if c.MCPName == "" { + t.Fatalf("no machine config pool name") + } + + // This is the Machine Config that we drift from + c.mc = helpers.NewMachineConfigExtended( + fmt.Sprintf("%s-%s", configDriftMCPrefix, string(uuid.NewUUID())), + helpers.MCLabelForRole(c.MCPName), + []ign3types.File{ + helpers.CreateIgn3File(configDriftFilename, "data:,"+configDriftFileContents, 420), + }, + []ign3types.Unit{ + { + Name: "unittest.service", + Contents: helpers.StrToPtr(configDriftSystemdUnitFileContents), + Dropins: []ign3types.Dropin{ + { + Contents: helpers.StrToPtr(configDriftSystemdDropinFileContents), + Name: "10-unittest-service.conf", + }, + { + Name: "20-unittest-service.conf", + }, + }, + }, + }, + []ign3types.SSHAuthorizedKey{}, + []string{}, + false, + []string{"foo=bar", "foo=baz"}, + "", + "", + ) + + // Delegate to the attached Setup Func for MachineConfig, MachineConfigPool + // creation. + c.SetupFunc(c.mc) + + mcp, err := c.ClientSet.MachineConfigPools().Get(context.TODO(), c.MCPName, metav1.GetOptions{}) + require.Nil(t, err) + c.mcp = *mcp + + // Get the target node + c.node = helpers.GetSingleNodeByRole(t, c.ClientSet, c.MCPName) + + // Get the MCD pod + pod, err := helpers.MCDForNode(c.ClientSet, &c.node) + require.Nil(t, err) + + c.pod = *pod +} + +// Tears down the test objects by delegating to the attached TeardownFunc +func (c configDriftTest) Teardown(t *testing.T) { + c.TeardownFunc() +} + +// Runs the Config Drift Test +func (c configDriftTest) Run(t *testing.T) { + // Get our kernel args for future verification + kargs := getKernelArgs(t, c.ClientSet, c.node) + + // Check that we have our kernel args pre-test + assertKernelArgsEqual(t, c.ClientSet, c.node, kargs, c.mc.Spec.KernelArguments) + + testCases := []struct { + name string + testFunc func(t *testing.T) + }{ + // 1. Mutates a file on the node. + // 2. Verifies that we can recover from it by reverting the contents to + // their original state. + { + name: "revert file content recovery for ignition file", + testFunc: func(t *testing.T) { + c.runDegradeAndRecoverContentRevert(t, configDriftFilename, configDriftFileContents) + }, + }, + { + name: "revert file content recovery for systemd dropin", + testFunc: func(t *testing.T) { + c.runDegradeAndRecoverContentRevert(t, configDriftSystemdDropinFilename, configDriftSystemdDropinFileContents) + }, + }, + { + name: "revert file content recovery for systemd unit", + testFunc: func(t *testing.T) { + c.runDegradeAndRecoverContentRevert(t, configDriftSystemdUnitFilename, configDriftSystemdUnitFileContents) + }, + }, + // 1. Mutates a file on the node. + // 2. Creates the forcefile to cause recovery. + { + name: "forcefile recovery", + testFunc: func(t *testing.T) { + if c.SkipForcefile { + t.Skip() + } + + c.runDegradeAndRecover(t, configDriftFilename, configDriftFileContents, func() { + t.Logf("Setting forcefile to initiate recovery (%s)", constants.MachineConfigDaemonForceFile) + helpers.ExecCmdOnNode(t, c.ClientSet, c.node, "touch", filepath.Join("/rootfs", constants.MachineConfigDaemonForceFile)) + }) + }, + }, + } + + for _, testCase := range testCases { + t.Run(testCase.name, func(t *testing.T) { + // With the way that the Config Drift Monitor is wired into the MCD, + // "machineconfiguration.openshift.io/state" gets set to "Done" before the + // Config Drift Monitor is started. + waitForConfigDriftMonitorStart(t, c.ClientSet, c.node) + + testCase.testFunc(t) + + // Ensure that we have our kernel args post-test + assertKernelArgsEqual(t, c.ClientSet, c.node, kargs, c.mc.Spec.KernelArguments) + }) + } +} + +func (c configDriftTest) runDegradeAndRecoverContentRevert(t *testing.T, filename, expectedContents string) { + c.runDegradeAndRecover(t, filename, expectedContents, func() { + t.Logf("Reverting %s to expected contents to initiate recovery", filename) + mutateFileOnNode(t, c.ClientSet, c.node, filename, expectedContents) + }) +} + +func (c configDriftTest) runDegradeAndRecover(t *testing.T, filename, expectedFileContents string, recoverFunc func()) { + mutateFileOnNode(t, c.ClientSet, c.node, filename, "not-the-data") + defer mutateFileOnNode(t, c.ClientSet, c.node, filename, expectedFileContents) + + // Ensure that the node and MCP reach a degraded state before we recover. + assertNodeAndMCPIsDegraded(t, c.ClientSet, c.node, c.mcp, filename) + + // Run the recovery function. + recoverFunc() + + // Verify that the node and MCP recover. + assertNodeAndMCPIsRecovered(t, c.ClientSet, c.node, c.mcp) +} diff --git a/test/e2e-shared-tests/shared.go b/test/e2e-shared-tests/shared.go new file mode 100644 index 0000000000..1e18946f12 --- /dev/null +++ b/test/e2e-shared-tests/shared.go @@ -0,0 +1,48 @@ +package e2e_shared_test + +import ( + "testing" +) + +// Options for all shared tests +type SharedTestOpts struct { + ConfigDriftTestOpts ConfigDriftTestOpts +} + +// All shared tests must define this interface +type SharedTest interface { + Setup(t *testing.T) + Run(t *testing.T) + Teardown(t *testing.T) +} + +// Runs all of the shared tests +func Run(t *testing.T, opts SharedTestOpts) { + testCases := []struct { + name string + sharedTest SharedTest + }{ + { + name: "MCD Config Drift", + sharedTest: &configDriftTest{ + ConfigDriftTestOpts: opts.ConfigDriftTestOpts, + }, + }, + } + + for _, testCase := range testCases { + t.Run(testCase.name, func(t *testing.T) { + defer timeIt(t, "Teardown complete", func() { + t.Log("Starting teardown") + testCase.sharedTest.Teardown(t) + }) + + timeIt(t, "Setup complete", func() { + t.Log("Starting setup") + testCase.sharedTest.Setup(t) + }) + + testCase.sharedTest.Run(t) + }) + } +} diff --git a/test/e2e-single-node/sno_mcd_test.go b/test/e2e-single-node/sno_mcd_test.go index e5eb8d6c21..44cb502f56 100644 --- a/test/e2e-single-node/sno_mcd_test.go +++ b/test/e2e-single-node/sno_mcd_test.go @@ -9,6 +9,7 @@ import ( "time" "github.com/openshift/machine-config-operator/pkg/daemon/constants" + e2eShared "github.com/openshift/machine-config-operator/test/e2e-shared-tests" "github.com/openshift/machine-config-operator/test/framework" "github.com/openshift/machine-config-operator/test/helpers" "github.com/pkg/errors" @@ -352,6 +353,47 @@ func TestNoReboot(t *testing.T) { t.Logf("Node %s didn't reboot as expected during rollback, uptime increased from %f to %f ", node.Name, uptimeOld, uptimeNew) } +func TestRunShared(t *testing.T) { + mcpName := "master" + + cs := framework.NewClientSet("") + + cleanupFuncs := helpers.NewCleanupFuncs() + + oldWorkerMCName := helpers.GetMcName(t, cs, mcpName) + + configOpts := e2eShared.ConfigDriftTestOpts{ + MCPName: mcpName, + ClientSet: cs, + SkipForcefile: true, + SetupFunc: func(mc *mcfgv1.MachineConfig) { + // Apply our MachineConfig and store the returned cleanup func + cleanupFuncs.Add(helpers.ApplyMC(t, cs, mc)) + + // Wait for the config to be rendered + renderedConfigName, err := helpers.WaitForRenderedConfig(t, cs, mcpName, mc.Name) + require.Nil(t, err) + + // Wait for the config to be rolled out + require.Nil(t, waitForSingleNodePoolComplete(t, cs, mcpName, renderedConfigName)) + + cleanupFuncs.Add(func() { + // Wait for the pool to catch up + require.Nil(t, waitForSingleNodePoolComplete(t, cs, mcpName, oldWorkerMCName)) + }) + }, + TeardownFunc: func() { + cleanupFuncs.Run() + }, + } + + sharedOpts := e2eShared.SharedTestOpts{ + ConfigDriftTestOpts: configOpts, + } + + e2eShared.Run(t, sharedOpts) +} + func waitForSingleNodePoolComplete(t *testing.T, cs *framework.ClientSet, pool, target string) error { var lastErr error startTime := time.Now() diff --git a/test/e2e/mcd_test.go b/test/e2e/mcd_test.go index e5cec743db..365a3ebd94 100644 --- a/test/e2e/mcd_test.go +++ b/test/e2e/mcd_test.go @@ -13,6 +13,7 @@ import ( "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/uuid" @@ -21,6 +22,7 @@ import ( mcfgv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1" ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" "github.com/openshift/machine-config-operator/pkg/daemon/constants" + e2eShared "github.com/openshift/machine-config-operator/test/e2e-shared-tests" "github.com/openshift/machine-config-operator/test/framework" "github.com/openshift/machine-config-operator/test/helpers" ) @@ -78,6 +80,30 @@ func TestMCDeployed(t *testing.T) { } } +func TestRunShared(t *testing.T) { + mcpName := "infra" + + cleanupFuncs := helpers.NewCleanupFuncs() + cs := framework.NewClientSet("") + + configOpts := e2eShared.ConfigDriftTestOpts{ + ClientSet: cs, + MCPName: mcpName, + SetupFunc: func(mc *mcfgv1.MachineConfig) { + cleanupFuncs.Add(helpers.CreatePoolAndApplyMC(t, cs, mcpName, mc)) + }, + TeardownFunc: func() { + cleanupFuncs.Run() + }, + } + + sharedOpts := e2eShared.SharedTestOpts{ + ConfigDriftTestOpts: configOpts, + } + + e2eShared.Run(t, sharedOpts) +} + func TestKernelArguments(t *testing.T) { cs := framework.NewClientSet("") @@ -751,7 +777,6 @@ func TestIgn3Cfg(t *testing.T) { } err = helpers.WaitForPoolComplete(t, cs, "infra", renderedConfig) require.Nil(t, err) - } func createMCToAddFileForRole(name, role, filename, data string) *mcfgv1.MachineConfig { diff --git a/test/framework/clientset.go b/test/framework/clientset.go index 702902b1b4..ebd029c9c5 100644 --- a/test/framework/clientset.go +++ b/test/framework/clientset.go @@ -1,6 +1,7 @@ package framework import ( + "fmt" "os" "github.com/golang/glog" @@ -21,6 +22,15 @@ type ClientSet struct { clientmachineconfigv1.MachineconfigurationV1Interface clientapiextensionsv1.ApiextensionsV1Interface clientoperatorsv1alpha1.OperatorV1alpha1Interface + kubeconfig string +} + +func (cs *ClientSet) GetKubeconfig() (string, error) { + if cs.kubeconfig != "" { + return cs.kubeconfig, nil + } + + return "", fmt.Errorf("no kubeconfig found; are you running a custom config or in-cluster?") } // NewClientSet returns a *ClientBuilder with the given kubeconfig. @@ -43,7 +53,9 @@ func NewClientSet(kubeconfig string) *ClientSet { panic(err) } - return NewClientSetFromConfig(config) + cs := NewClientSetFromConfig(config) + cs.kubeconfig = kubeconfig + return cs } // NewClientSetFromConfig returns a *ClientBuilder with the given rest config. diff --git a/test/helpers/utils.go b/test/helpers/utils.go index 388cb25858..cdebb02fc8 100644 --- a/test/helpers/utils.go +++ b/test/helpers/utils.go @@ -11,6 +11,7 @@ import ( ign3types "github.com/coreos/ignition/v2/config/v3_2/types" mcfgv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1" + "github.com/openshift/machine-config-operator/pkg/daemon/constants" "github.com/openshift/machine-config-operator/test/framework" "github.com/pkg/errors" "github.com/stretchr/testify/require" @@ -18,9 +19,82 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/util/wait" ) +type CleanupFuncs struct { + funcs []func() +} + +func (c *CleanupFuncs) Add(f func()) { + c.funcs = append(c.funcs, f) +} + +func (c *CleanupFuncs) Run() { + for _, f := range c.funcs { + f() + } +} + +func NewCleanupFuncs() CleanupFuncs { + return CleanupFuncs{ + funcs: []func(){}, + } +} + +func ApplyMC(t *testing.T, cs *framework.ClientSet, mc *mcfgv1.MachineConfig) func() { + _, err := cs.MachineConfigs().Create(context.TODO(), mc, metav1.CreateOptions{}) + require.Nil(t, err) + + return func() { + require.Nil(t, cs.MachineConfigs().Delete(context.TODO(), mc.Name, metav1.DeleteOptions{})) + } +} + +func CreatePoolAndApplyMC(t *testing.T, cs *framework.ClientSet, poolName string, mc *mcfgv1.MachineConfig) func() { + workerMCPName := "worker" + + t.Logf("Setting up pool %s", poolName) + + unlabelFunc := LabelRandomNodeFromPool(t, cs, workerMCPName, MCPNameToRole(poolName)) + deleteMCPFunc := CreateMCP(t, cs, poolName) + + node := GetSingleNodeByRole(t, cs, poolName) + + t.Logf("Target Node: %s", node.Name) + + mcDeleteFunc := ApplyMC(t, cs, mc) + + WaitForConfigAndPoolComplete(t, cs, poolName, mc.Name) + + mcpMCName := GetMcName(t, cs, poolName) + require.Nil(t, WaitForNodeConfigChange(t, cs, node, mcpMCName)) + + return func() { + t.Logf("Cleaning up MCP %s", poolName) + t.Logf("Removing label %s from node %s", MCPNameToRole(poolName), node.Name) + unlabelFunc() + + workerMC := GetMcName(t, cs, workerMCPName) + + // Wait for the worker pool to catch up with the deleted label + time.Sleep(5 * time.Second) + + t.Logf("Waiting for %s pool to finish applying %s", workerMCPName, workerMC) + require.Nil(t, WaitForPoolComplete(t, cs, workerMCPName, workerMC)) + + t.Logf("Ensuring node has %s pool config before deleting pool %s", workerMCPName, poolName) + require.Nil(t, WaitForNodeConfigChange(t, cs, node, workerMC)) + + t.Logf("Deleting MCP %s", poolName) + deleteMCPFunc() + + t.Logf("Deleting MachineConfig %s", mc.Name) + mcDeleteFunc() + } +} + // GetMcName returns the current configuration name of the machine config pool poolName func GetMcName(t *testing.T, cs *framework.ClientSet, poolName string) string { // grab the initial machineconfig used by the worker pool @@ -120,6 +194,30 @@ func WaitForPoolComplete(t *testing.T, cs *framework.ClientSet, pool, target str return nil } +func WaitForNodeConfigChange(t *testing.T, cs *framework.ClientSet, node corev1.Node, mcName string) error { + startTime := time.Now() + err := wait.PollImmediate(2*time.Second, 20*time.Minute, func() (bool, error) { + n, err := cs.Nodes().Get(context.TODO(), node.Name, metav1.GetOptions{}) + if err != nil { + return false, err + } + + current := n.Annotations[constants.CurrentMachineConfigAnnotationKey] + desired := n.Annotations[constants.DesiredMachineConfigAnnotationKey] + + state := n.Annotations[constants.MachineConfigDaemonStateAnnotationKey] + + return current == desired && desired == mcName && state == constants.MachineConfigDaemonStateDone, nil + }) + + if err != nil { + return fmt.Errorf("node config change did not occur (waited %v): %w", time.Since(startTime), err) + } + + t.Logf("Node %s changed config to %s (waited %v)", node.Name, mcName, time.Since(startTime)) + return nil +} + // LabelRandomNodeFromPool gets all nodes in pool and chooses one at random to label func LabelRandomNodeFromPool(t *testing.T, cs *framework.ClientSet, pool, label string) func() { nodes, err := GetNodesByRole(cs, pool) @@ -131,11 +229,19 @@ func LabelRandomNodeFromPool(t *testing.T, cs *framework.ClientSet, pool, label // G404: Use of weak random number generator (math/rand instead of crypto/rand) // #nosec infraNode := nodes[rand.Intn(len(nodes))] - out, err := exec.Command("oc", "label", "node", infraNode.Name, label+"=", "--overwrite=true").CombinedOutput() - require.Nil(t, err, "unable to label worker node %s with infra: %s", infraNode.Name, string(out)) + infraNode.Labels[label] = "" + + _, err = cs.Nodes().Update(context.TODO(), &infraNode, metav1.UpdateOptions{}) + + require.Nil(t, err, "unable to label worker node %s with infra: %s", infraNode.Name, err) return func() { - out, err = exec.Command("oc", "label", "node", infraNode.Name, label+"-").CombinedOutput() - require.Nil(t, err, "unable to remove label from node %s: %s", infraNode.Name, string(out)) + updatedNode, err := cs.Nodes().Get(context.TODO(), infraNode.Name, metav1.GetOptions{}) + require.Nil(t, err, "unable to get node to update: %s", err) + + delete(updatedNode.Labels, label) + + _, err = cs.Nodes().Update(context.TODO(), updatedNode, metav1.UpdateOptions{}) + require.Nil(t, err, "unable to remove label from node %s: %s", infraNode.Name, err) } } @@ -201,11 +307,23 @@ func CreateMC(name, role string) *mcfgv1.MachineConfig { // ExecCmdOnNode finds a node's mcd, and oc rsh's into it to execute a command on the node // all commands should use /rootfs as root func ExecCmdOnNode(t *testing.T, cs *framework.ClientSet, node corev1.Node, subArgs ...string) string { + // Check for an oc binary in $PATH. + path, err := exec.LookPath("oc") + if err != nil { + t.Fatalf("could not locate oc command: %s", err) + } + + // Get the kubeconfig file path + kubeconfig, err := cs.GetKubeconfig() + if err != nil { + t.Fatalf("could not get kubeconfig: %s", err) + } + mcd, err := mcdForNode(cs, &node) require.Nil(t, err) mcdName := mcd.ObjectMeta.Name - entryPoint := "oc" + entryPoint := path args := []string{"rsh", "-n", "openshift-machine-config-operator", "-c", "machine-config-daemon", @@ -213,6 +331,10 @@ func ExecCmdOnNode(t *testing.T, cs *framework.ClientSet, node corev1.Node, subA args = append(args, subArgs...) cmd := exec.Command(entryPoint, args...) + // If one passes a path to a kubeconfig via NewClientSet instead of setting + // $KUBECONFIG, oc will be unaware of it. To remedy, we explicitly set + // KUBECONFIG to the value held by the clientset. + cmd.Env = append(cmd.Env, "KUBECONFIG="+kubeconfig) cmd.Stderr = os.Stderr out, err := cmd.Output() @@ -278,6 +400,10 @@ func CreateIgn3File(path, content string, mode int) ign3types.File { } } +func MCDForNode(cs *framework.ClientSet, node *corev1.Node) (*corev1.Pod, error) { + return mcdForNode(cs, node) +} + func mcdForNode(cs *framework.ClientSet, node *corev1.Node) (*corev1.Pod, error) { // find the MCD pod that has spec.nodeNAME = node.Name and get its name: listOptions := metav1.ListOptions{ diff --git a/vendor/modules.txt b/vendor/modules.txt index 0c5f7aeee2..a42be14a9d 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -275,6 +275,7 @@ github.com/fatih/color # github.com/fatih/structtag v1.2.0 github.com/fatih/structtag # github.com/fsnotify/fsnotify v1.4.9 +## explicit github.com/fsnotify/fsnotify # github.com/fzipp/gocyclo v0.3.1 github.com/fzipp/gocyclo