From 804af8c35eedbf5c1f6edb3004a1f047f76cba2a Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Wed, 21 Dec 2022 16:58:00 -0500 Subject: [PATCH 1/5] Remove unnecessary warning All errors produced by this function are accompanied by Fatal-level logs, so there is no need for an additional warning-level log. --- pkg/agent/waitfor.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/agent/waitfor.go b/pkg/agent/waitfor.go index 3e852d4d797..fcf19703849 100644 --- a/pkg/agent/waitfor.go +++ b/pkg/agent/waitfor.go @@ -16,7 +16,6 @@ func WaitForBootstrapComplete(assetDir string) (*Cluster, error) { ctx := context.Background() cluster, err := NewCluster(ctx, assetDir) if err != nil { - logrus.Warn("unable to make cluster object to track installation") return nil, err } From 596f1622feb5a3a428747d917ca98a786b692a35 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Thu, 22 Dec 2022 10:02:55 -0500 Subject: [PATCH 2/5] Remove dead code err is always nil at this point, because we check it further up and it is not overwritten by the variable of the same name that is shadowing it inside the anonymous function, as was probably intended. --- pkg/agent/waitfor.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/pkg/agent/waitfor.go b/pkg/agent/waitfor.go index fcf19703849..29391b9f754 100644 --- a/pkg/agent/waitfor.go +++ b/pkg/agent/waitfor.go @@ -90,9 +90,6 @@ func WaitForInstallComplete(assetDir string) (*Cluster, error) { waitErr := waitContext.Err() if waitErr != nil && waitErr != context.Canceled { - if err != nil { - return cluster, errors.Wrap(err, "Error occurred during installation") - } return cluster, errors.Wrap(waitErr, "Cluster installation timed out") } return cluster, nil From 48058f9cb93ee747c885abd67a29d96fddc96835 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Wed, 21 Dec 2022 17:04:39 -0500 Subject: [PATCH 3/5] Refactor agent wait-for commands Create the Cluster object outside of the WaitFor...() implementation and pass it in, instead of creating it inside and returning it. --- cmd/openshift-install/agent/waitfor.go | 18 ++++++++++++++++-- pkg/agent/waitfor.go | 24 ++++++++---------------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/cmd/openshift-install/agent/waitfor.go b/cmd/openshift-install/agent/waitfor.go index e8d05eb8eb5..cca8b76008f 100644 --- a/cmd/openshift-install/agent/waitfor.go +++ b/cmd/openshift-install/agent/waitfor.go @@ -1,6 +1,8 @@ package agent import ( + "context" + "github.com/pkg/errors" "github.com/sirupsen/logrus" "github.com/spf13/cobra" @@ -42,8 +44,14 @@ func newWaitForBootstrapCompleteCmd() *cobra.Command { if len(assetDir) == 0 { logrus.Fatal("No cluster installation directory found") } - cluster, err := agentpkg.WaitForBootstrapComplete(assetDir) + + ctx := context.Background() + cluster, err := agentpkg.NewCluster(ctx, assetDir) if err != nil { + logrus.Exit(exitCodeBootstrapFailed) + } + + if err := agentpkg.WaitForBootstrapComplete(cluster); err != nil { logrus.Debug("Printing the event list gathered from the Agent Rest API") cluster.PrintInfraEnvRestAPIEventList() err2 := cluster.API.OpenShift.LogClusterOperatorConditions() @@ -70,8 +78,14 @@ func newWaitForInstallCompleteCmd() *cobra.Command { if len(assetDir) == 0 { logrus.Fatal("No cluster installation directory found") } - cluster, err := agentpkg.WaitForInstallComplete(assetDir) + + ctx := context.Background() + cluster, err := agentpkg.NewCluster(ctx, assetDir) if err != nil { + logrus.Exit(exitCodeBootstrapFailed) + } + + if err = agentpkg.WaitForInstallComplete(cluster); err != nil { logrus.Debug("Printing the event list gathered from the Agent Rest API") cluster.PrintInfraEnvRestAPIEventList() err2 := cluster.API.OpenShift.LogClusterOperatorConditions() diff --git a/pkg/agent/waitfor.go b/pkg/agent/waitfor.go index 29391b9f754..d352a9ed515 100644 --- a/pkg/agent/waitfor.go +++ b/pkg/agent/waitfor.go @@ -11,14 +11,7 @@ import ( // WaitForBootstrapComplete Wait for the bootstrap process to complete on // cluster installations triggered by the agent installer. -func WaitForBootstrapComplete(assetDir string) (*Cluster, error) { - - ctx := context.Background() - cluster, err := NewCluster(ctx, assetDir) - if err != nil { - return nil, err - } - +func WaitForBootstrapComplete(cluster *Cluster) error { start := time.Now() previous := time.Now() timeout := 60 * time.Minute @@ -55,22 +48,21 @@ func WaitForBootstrapComplete(assetDir string) (*Cluster, error) { waitErr := waitContext.Err() if waitErr != nil { if waitErr == context.Canceled && lastErr != nil { - return cluster, errors.Wrap(lastErr, "bootstrap process returned error") + return errors.Wrap(lastErr, "bootstrap process returned error") } if waitErr == context.DeadlineExceeded { - return cluster, errors.Wrap(waitErr, "bootstrap process timed out") + return errors.Wrap(waitErr, "bootstrap process timed out") } } - return cluster, nil + return nil } // WaitForInstallComplete Waits for the cluster installation triggered by the // agent installer to be complete. -func WaitForInstallComplete(assetDir string) (*Cluster, error) { - - cluster, err := WaitForBootstrapComplete(assetDir) +func WaitForInstallComplete(cluster *Cluster) error { + err := WaitForBootstrapComplete(cluster) if err != nil { return cluster, errors.Wrap(err, "error occured during bootstrap process") } @@ -90,7 +82,7 @@ func WaitForInstallComplete(assetDir string) (*Cluster, error) { waitErr := waitContext.Err() if waitErr != nil && waitErr != context.Canceled { - return cluster, errors.Wrap(waitErr, "Cluster installation timed out") + return errors.Wrap(waitErr, "Cluster installation timed out") } - return cluster, nil + return nil } From 9b613cf8166df2444002256ff8ebb71a052b7922 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Wed, 21 Dec 2022 16:48:42 -0500 Subject: [PATCH 4/5] OCPBUGS-3706: Don't report bootstrap errors as install errors When running the 'agent wait-for install-complete' command, we first check that bootstrapping is complete (by running the equivalent of 'agent wait-for bootstrap-complete'. However, if this failed because the bootstrapping timed out, we would report it as an install failure along with the corresponding debug messages (stating that the problem is with the cluster operators, and inevitably failing to fetch data about which). If the failure occurs during bootstrapping, report it as a bootstrap error the same as you would get from 'agent wait-for bootstrap-complete'. --- cmd/openshift-install/agent/waitfor.go | 30 +++++++++++++++----------- pkg/agent/waitfor.go | 6 ------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/cmd/openshift-install/agent/waitfor.go b/cmd/openshift-install/agent/waitfor.go index cca8b76008f..2219c688e9e 100644 --- a/cmd/openshift-install/agent/waitfor.go +++ b/cmd/openshift-install/agent/waitfor.go @@ -33,6 +33,19 @@ func NewWaitForCmd() *cobra.Command { return cmd } +func handleBootstrapError(cluster *agentpkg.Cluster, err error) { + logrus.Debug("Printing the event list gathered from the Agent Rest API") + cluster.PrintInfraEnvRestAPIEventList() + err2 := cluster.API.OpenShift.LogClusterOperatorConditions() + if err2 != nil { + logrus.Error("Attempted to gather ClusterOperator status after wait failure: ", err2) + } + logrus.Info("Use the following commands to gather logs from the cluster") + logrus.Info("openshift-install gather bootstrap --help") + logrus.Error(errors.Wrap(err, "Bootstrap failed to complete: ")) + logrus.Exit(exitCodeBootstrapFailed) +} + func newWaitForBootstrapCompleteCmd() *cobra.Command { return &cobra.Command{ Use: "bootstrap-complete", @@ -52,16 +65,7 @@ func newWaitForBootstrapCompleteCmd() *cobra.Command { } if err := agentpkg.WaitForBootstrapComplete(cluster); err != nil { - logrus.Debug("Printing the event list gathered from the Agent Rest API") - cluster.PrintInfraEnvRestAPIEventList() - err2 := cluster.API.OpenShift.LogClusterOperatorConditions() - if err2 != nil { - logrus.Error("Attempted to gather ClusterOperator status after wait failure: ", err2) - } - logrus.Info("Use the following commands to gather logs from the cluster") - logrus.Info("openshift-install gather bootstrap --help") - logrus.Error(errors.Wrap(err, "Bootstrap failed to complete: ")) - logrus.Exit(exitCodeBootstrapFailed) + handleBootstrapError(cluster, err) } }, } @@ -85,9 +89,11 @@ func newWaitForInstallCompleteCmd() *cobra.Command { logrus.Exit(exitCodeBootstrapFailed) } + if err := agentpkg.WaitForBootstrapComplete(cluster); err != nil { + handleBootstrapError(cluster, err) + } + if err = agentpkg.WaitForInstallComplete(cluster); err != nil { - logrus.Debug("Printing the event list gathered from the Agent Rest API") - cluster.PrintInfraEnvRestAPIEventList() err2 := cluster.API.OpenShift.LogClusterOperatorConditions() if err2 != nil { logrus.Error("Attempted to gather ClusterOperator status after wait failure: ", err2) diff --git a/pkg/agent/waitfor.go b/pkg/agent/waitfor.go index d352a9ed515..aa3df6e870f 100644 --- a/pkg/agent/waitfor.go +++ b/pkg/agent/waitfor.go @@ -61,12 +61,6 @@ func WaitForBootstrapComplete(cluster *Cluster) error { // WaitForInstallComplete Waits for the cluster installation triggered by the // agent installer to be complete. func WaitForInstallComplete(cluster *Cluster) error { - - err := WaitForBootstrapComplete(cluster) - if err != nil { - return cluster, errors.Wrap(err, "error occured during bootstrap process") - } - timeout := 90 * time.Minute waitContext, cancel := context.WithTimeout(cluster.Ctx, timeout) defer cancel() From 6f93877d58517e9f2138adb976e23a687d8e73a7 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Thu, 22 Dec 2022 11:53:08 -0500 Subject: [PATCH 5/5] Log timeout when install does not complete --- cmd/openshift-install/agent/waitfor.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/openshift-install/agent/waitfor.go b/cmd/openshift-install/agent/waitfor.go index 2219c688e9e..c92a2161853 100644 --- a/cmd/openshift-install/agent/waitfor.go +++ b/cmd/openshift-install/agent/waitfor.go @@ -94,6 +94,7 @@ func newWaitForInstallCompleteCmd() *cobra.Command { } if err = agentpkg.WaitForInstallComplete(cluster); err != nil { + logrus.Error(err) err2 := cluster.API.OpenShift.LogClusterOperatorConditions() if err2 != nil { logrus.Error("Attempted to gather ClusterOperator status after wait failure: ", err2)