diff --git a/cmd/machine-config-daemon/pivot.go b/cmd/machine-config-daemon/pivot.go index 8ef3448818..00247c376f 100644 --- a/cmd/machine-config-daemon/pivot.go +++ b/cmd/machine-config-daemon/pivot.go @@ -7,6 +7,7 @@ import ( "io/ioutil" "os" "os/exec" + "path/filepath" "strings" // Enable sha256 in container image references @@ -183,7 +184,7 @@ func updateTuningArgs(tuningFilePath, cmdLinePath string) (bool, error) { return changed, nil } -func run(_ *cobra.Command, args []string) error { +func run(_ *cobra.Command, args []string) (retErr error) { flag.Set("logtostderr", "true") flag.Parse() @@ -241,6 +242,10 @@ func Execute(cmd *cobra.Command, args []string) { err := run(cmd, args) if err != nil { fmt.Printf("error: %v\n", err) + os.MkdirAll(filepath.Dir(types.PivotFailurePath), 0755) + // write a pivot failure file that we'll read from MCD since we start this with systemd + // and we just follow logs + ioutil.WriteFile(types.PivotFailurePath, []byte(err.Error()), 0644) os.Exit(1) } } diff --git a/pkg/controller/node/node_controller.go b/pkg/controller/node/node_controller.go index 3a672cbe92..a47597a5a6 100644 --- a/pkg/controller/node/node_controller.go +++ b/pkg/controller/node/node_controller.go @@ -733,17 +733,26 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error { nodes, err := ctrl.getNodesForPool(pool) if err != nil { + if syncErr := ctrl.syncStatusOnly(pool); syncErr != nil { + return goerrs.Wrapf(err, "error getting nodes for pool %q, sync error: %v", pool.Name, syncErr) + } return err } maxunavail, err := maxUnavailable(pool, nodes) if err != nil { + if syncErr := ctrl.syncStatusOnly(pool); syncErr != nil { + return goerrs.Wrapf(err, "error getting max unavailable count for pool %q, sync error: %v", pool.Name, syncErr) + } return err } candidates := getCandidateMachines(pool, nodes, maxunavail) for _, node := range candidates { if err := ctrl.setDesiredMachineConfigAnnotation(node.Name, pool.Spec.Configuration.Name); err != nil { + if syncErr := ctrl.syncStatusOnly(pool); syncErr != nil { + return goerrs.Wrapf(err, "error setting desired machine config annotation for node %q, pool %q, sync error: %v", node.Name, pool.Name, syncErr) + } return err } } diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 5b40332535..e7c682f034 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -1272,12 +1272,12 @@ func (dn *Daemon) validateOnDiskState(currentConfig *mcfgv1.MachineConfig) error return err } if !osMatch { - return errors.Errorf("expected target osImageURL %s", currentConfig.Spec.OSImageURL) + return errors.Errorf("expected target osImageURL %q, have %q", currentConfig.Spec.OSImageURL, dn.bootedOSImageURL) } // And the rest of the disk state currentIgnConfig, report, err := ign.Parse(currentConfig.Spec.Config.Raw) if err != nil { - return errors.Errorf("Failed to parse Ignition for validation: %s\nReport: %v", err, report) + return errors.Errorf("failed to parse Ignition for validation: %s\nReport: %v", err, report) } if err := checkFiles(currentIgnConfig.Storage.Files); err != nil { return err diff --git a/pkg/daemon/pivot/types/imageinspection.go b/pkg/daemon/pivot/types/imageinspection.go index 12cd1e3929..0aa5d76107 100644 --- a/pkg/daemon/pivot/types/imageinspection.go +++ b/pkg/daemon/pivot/types/imageinspection.go @@ -3,4 +3,9 @@ package types const ( // PivotNamePrefix is literally the name prefix of the new pivot PivotNamePrefix = "ostree-container-pivot-" + + // PivotFailurePath is the path pivot writes its error to when it fails. We need this to bubble + // up any errors since we only log those errors in MCD. If we don't do this then the error messages + // don't get visibility up the stack. + PivotFailurePath = "/run/pivot/failure" ) diff --git a/pkg/daemon/pivot/utils/run.go b/pkg/daemon/pivot/utils/run.go index 130d4e13e0..34193809e8 100644 --- a/pkg/daemon/pivot/utils/run.go +++ b/pkg/daemon/pivot/utils/run.go @@ -2,58 +2,63 @@ package utils import ( "bytes" + "io" "os" "os/exec" "strings" "time" "github.com/golang/glog" + "github.com/pkg/errors" "k8s.io/apimachinery/pkg/util/wait" ) // runImpl is the actual shell execution implementation used by other functions. -func runImpl(capture bool, command string, args ...string) ([]byte, error) { +func runImpl(command string, args ...string) ([]byte, error) { glog.Infof("Running: %s %s\n", command, strings.Join(args, " ")) cmd := exec.Command(command, args...) - cmd.Stderr = os.Stderr - var stdout bytes.Buffer - if !capture { - cmd.Stdout = os.Stdout - } else { - cmd.Stdout = &stdout - } + // multiplex writes to std streams so we keep seeing logs in MCD/systemd + // but we'll still be able to give out something here + var b bytes.Buffer + stderr := io.MultiWriter(os.Stderr, &b) + stdout := io.MultiWriter(os.Stdout, &b) + cmd.Stderr = stderr + cmd.Stdout = stdout err := cmd.Run() if err != nil { - return nil, err - } - if capture { - return stdout.Bytes(), nil + return nil, errors.Wrapf(err, "running %s %s failed: %s", command, strings.Join(args, " "), b.String()) } - return []byte{}, nil + return b.Bytes(), nil } // runExtBackoff is an extension to runExt that supports configuring retries/duration/backoff. -func runExtBackoff(capture bool, backoff wait.Backoff, command string, args ...string) string { - var output string - err := wait.ExponentialBackoff(backoff, func() (bool, error) { - if out, e := runImpl(capture, command, args...); e != nil { - glog.Warningf("%s failed: %v; retrying...", command, e) +func runExtBackoff(backoff wait.Backoff, command string, args ...string) (string, error) { + var ( + output string + lastErr error + ) + if err := wait.ExponentialBackoff(backoff, func() (bool, error) { + out, err := runImpl(command, args...) + if err != nil { + lastErr = err + glog.Warningf("%s failed: %v; retrying...", command, err) return false, nil - } else if capture { - output = strings.TrimSpace(string(out)) } + output = strings.TrimSpace(string(out)) return true, nil - }) - if err != nil { - glog.Fatalf("%s: %s", command, err) + }); err != nil { + if err == wait.ErrWaitTimeout { + return "", errors.Wrapf(lastErr, "failed to run command %s (%d tries): %v", command, backoff.Steps, err) + } + return "", errors.Wrap(err, "failed to run command %s (%d tries): %v") } - return output + return output, nil } // RunExt executes a command, optionally capturing the output and retrying multiple // times before exiting with a fatal error. -func RunExt(capture bool, retries int, command string, args ...string) string { - return runExtBackoff(capture, wait.Backoff{ +func RunExt(retries int, command string, args ...string) (string, error) { + return runExtBackoff(wait.Backoff{ Steps: retries + 1, // times to try Duration: 5 * time.Second, // sleep between tries Factor: 2, // factor by which to increase sleep diff --git a/pkg/daemon/pivot/utils/run_test.go b/pkg/daemon/pivot/utils/run_test.go index befeb2e426..0fc714a702 100644 --- a/pkg/daemon/pivot/utils/run_test.go +++ b/pkg/daemon/pivot/utils/run_test.go @@ -13,16 +13,13 @@ import ( // TestRunExt verifies that the wait machinery works, even though we're only // just testing a single step here since it's tricky to test retries. func TestRunExt(t *testing.T) { - RunExt(false, 0, "echo", "echo", "from", "TestRunExt") - - result := RunExt(true, 0, "echo", "hello", "world") - assert.Equal(t, "hello world", result) + RunExt(0, "echo", "echo", "from", "TestRunExt") tmpdir, err := ioutil.TempDir("", "run_test") assert.Nil(t, err) defer os.RemoveAll(tmpdir) tmpf := tmpdir + "/t" - runExtBackoff(false, wait.Backoff{Steps: 6, + runExtBackoff(wait.Backoff{Steps: 6, Duration: 1 * time.Second, Factor: 1.1}, "sh", "-c", "printf x >> "+tmpf+" && test $(wc -c < "+tmpf+") = 3") diff --git a/pkg/daemon/rpm-ostree.go b/pkg/daemon/rpm-ostree.go index f297fdaaf9..ad50c3d8b6 100644 --- a/pkg/daemon/rpm-ostree.go +++ b/pkg/daemon/rpm-ostree.go @@ -172,7 +172,10 @@ func (r *RpmOstreeClient) PullAndRebase(container string, keep bool) (imgid stri args := []string{"pull", "-q"} args = append(args, authArgs...) args = append(args, container) - pivotutils.RunExt(false, numRetriesNetCommands, "podman", args...) + _, err = pivotutils.RunExt(numRetriesNetCommands, "podman", args...) + if err != nil { + return + } } else { if previousPivot != "" { var targetMatched bool @@ -190,7 +193,10 @@ func (r *RpmOstreeClient) PullAndRebase(container string, keep bool) (imgid stri args := []string{"pull", "-q"} args = append(args, authArgs...) args = append(args, container) - pivotutils.RunExt(false, numRetriesNetCommands, "podman", args...) + _, err = pivotutils.RunExt(numRetriesNetCommands, "podman", args...) + if err != nil { + return + } } inspectArgs := []string{"inspect", "--type=image"} diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go index f16b838038..a9c27eca59 100644 --- a/pkg/daemon/update.go +++ b/pkg/daemon/update.go @@ -33,6 +33,7 @@ import ( mcfgv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1" ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" "github.com/openshift/machine-config-operator/pkg/daemon/constants" + pivottypes "github.com/openshift/machine-config-operator/pkg/daemon/pivot/types" pivotutils "github.com/openshift/machine-config-operator/pkg/daemon/pivot/utils" ) @@ -658,7 +659,10 @@ func (dn *Daemon) mountOSContainer(container string) (mnt, containerName string, args := []string{"pull", "-q"} args = append(args, authArgs...) args = append(args, container) - pivotutils.RunExt(false, numRetriesNetCommands, "podman", args...) + _, err = pivotutils.RunExt(numRetriesNetCommands, "podman", args...) + if err != nil { + return + } containerName = "mcd-" + string(uuid.NewUUID()) // `podman mount` wants a container, so let's create a dummy one, but not run it @@ -1285,7 +1289,11 @@ func (dn *Daemon) updateOS(config *mcfgv1.MachineConfig) error { if dn.kubeClient != nil { if err := dn.NodeUpdaterClient.RunPivot(newURL); err != nil { MCDPivotErr.WithLabelValues(newURL, err.Error()).SetToCurrentTime() - return fmt.Errorf("failed to run pivot: %v", err) + pivotErr, err2 := ioutil.ReadFile(pivottypes.PivotFailurePath) + if err2 != nil || len(pivotErr) == 0 { + glog.Warningf("pivot error file doesn't contain anything, it was never written or an error occurred: %v", err2) + } + return fmt.Errorf("failed to run pivot: %v: %v", err, string(pivotErr)) } } else { // If we're here we're invoked via `machine-config-daemon-firstboot.service`, so let's @@ -1297,7 +1305,7 @@ func (dn *Daemon) updateOS(config *mcfgv1.MachineConfig) error { } if !changed { // This really shouldn't happen - glog.Warningf("Didn't change when updating to %s ?", newURL) + glog.Warningf("Didn't change when updating to %q?", newURL) } }