From 2c82a98d5d7fffe395eb1bf0bab1f540506db49d Mon Sep 17 00:00:00 2001
From: Patrick Dillon
Date: Wed, 24 Apr 2024 14:24:07 -0400
Subject: [PATCH 1/2] capi: add timeout when provisioning

Adds a 15m timeout to the infrastructure provisioning and machine
provisioning stages of CAPI, so that the controllers do not spin
indefinitely in the case of a failure. 15m is an arbitrary value; the
timeout should balance giving ample time to provision the resources
against not making users wait too long when something goes wrong.
---
 pkg/infrastructure/clusterapi/clusterapi.go | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pkg/infrastructure/clusterapi/clusterapi.go b/pkg/infrastructure/clusterapi/clusterapi.go
index d478831f1b2..e0ddfe32764 100644
--- a/pkg/infrastructure/clusterapi/clusterapi.go
+++ b/pkg/infrastructure/clusterapi/clusterapi.go
@@ -40,6 +40,9 @@ import (
 // interface the installer uses to call this provider.
 var _ infrastructure.Provider = (*InfraProvider)(nil)
 
+// timeout for each provisioning step.
+const timeout = 15 * time.Minute
+
 // InfraProvider implements common Cluster API logic and
 // contains the platform CAPI provider, which is called
 // in the lifecycle defined by the Provider interface.
@@ -159,6 +162,7 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
 			Duration: time.Second * 10,
 			Factor:   float64(1.5),
 			Steps:    32,
+			Cap:      timeout,
 		}, func(ctx context.Context) (bool, error) {
 			c := &clusterv1.Cluster{}
 			if err := cl.Get(ctx, client.ObjectKey{
@@ -173,7 +177,7 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
 			cluster = c
 			return cluster.Status.InfrastructureReady, nil
 		}); err != nil {
-			return fileList, err
+			return fileList, fmt.Errorf("infrastructure was not ready within %v: %w", timeout, err)
 		}
 		if cluster == nil {
 			return fileList, fmt.Errorf("error occurred during load balancer ready check")
@@ -243,6 +247,7 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
 		Duration: time.Second * 10,
 		Factor:   float64(1.5),
 		Steps:    32,
+		Cap:      timeout,
 	}, func(ctx context.Context) (bool, error) {
 		for i := int64(0); i < masterCount; i++ {
 			machine := &clusterv1.Machine{}
@@ -266,7 +271,7 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
 			}
 			return true, nil
 		}); err != nil {
-			return fileList, err
+			return fileList, fmt.Errorf("machines were not provisioned within %v: %w", timeout, err)
 		}
 	}
 

From 4e2c8f61dcf1d3fa835347569c5600c53f9ea06e Mon Sep 17 00:00:00 2001
From: Patrick Dillon
Date: Wed, 24 Apr 2024 14:43:00 -0400
Subject: [PATCH 2/2] Add timers to CAPI infrastructure provisioning

Adds timers to each stage of CAPI infrastructure provisioning. These
timings are logged when the install completes and can be used as a
guide if we need to change the provisioning timeouts.
---
 pkg/infrastructure/clusterapi/clusterapi.go | 46 ++++++++++++++++++---
 1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/pkg/infrastructure/clusterapi/clusterapi.go b/pkg/infrastructure/clusterapi/clusterapi.go
index e0ddfe32764..23bbbb4717b 100644
--- a/pkg/infrastructure/clusterapi/clusterapi.go
+++ b/pkg/infrastructure/clusterapi/clusterapi.go
@@ -32,6 +32,7 @@ import (
 	"github.com/openshift/installer/pkg/asset/rhcos"
 	"github.com/openshift/installer/pkg/clusterapi"
 	"github.com/openshift/installer/pkg/infrastructure"
+	"github.com/openshift/installer/pkg/metrics/timer"
 	"github.com/openshift/installer/pkg/types"
 )
 
@@ -40,8 +41,17 @@ import (
 // interface the installer uses to call this provider.
 var _ infrastructure.Provider = (*InfraProvider)(nil)
 
-// timeout for each provisioning step.
-const timeout = 15 * time.Minute
+const (
+	// timeout for each provisioning step.
+	timeout = 15 * time.Minute
+
+	preProvisionStage        = "Infrastructure Pre-provisioning"
+	infrastructureStage      = "Network-infrastructure Provisioning"
+	infrastructureReadyStage = "Post-network, pre-machine Provisioning"
+	ignitionStage            = "Bootstrap Ignition Provisioning"
+	machineStage             = "Machine Provisioning"
+	postProvisionStage       = "Infrastructure Post-provisioning"
+)
 
 // InfraProvider implements common Cluster API logic and
 // contains the platform CAPI provider, which is called
@@ -110,15 +120,17 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
 			MachineManifests: machineManifests,
 			WorkersAsset:     workersAsset,
 		}
-
+		timer.StartTimer(preProvisionStage)
 		if err := p.PreProvision(ctx, preProvisionInput); err != nil {
 			return fileList, fmt.Errorf("failed during pre-provisioning: %w", err)
 		}
+		timer.StopTimer(preProvisionStage)
 	} else {
 		logrus.Debugf("No pre-provisioning requirements for the %s provider", i.impl.Name())
 	}
 
 	// Run the CAPI system.
+	timer.StartTimer(infrastructureStage)
 	capiSystem := clusterapi.System()
 	if err := capiSystem.Run(ctx, installConfig); err != nil {
 		return fileList, fmt.Errorf("failed to run cluster api system: %w", err)
@@ -156,6 +168,9 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
 
 	// Wait for successful provisioning by checking the InfrastructureReady
 	// status on the cluster object.
+	untilTime := time.Now().Add(timeout)
+	timezone, _ := untilTime.Zone()
+	logrus.Infof("Waiting up to %v (until %v %s) for network infrastructure to become ready...", timeout, untilTime.Format(time.Kitchen), timezone)
 	var cluster *clusterv1.Cluster
 	{
 		if err := wait.ExponentialBackoffWithContext(ctx, wait.Backoff{
@@ -177,7 +192,10 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
 			cluster = c
 			return cluster.Status.InfrastructureReady, nil
 		}); err != nil {
-			return fileList, fmt.Errorf("infrastructure was not ready within %v: %w", timeout, err)
+			if wait.Interrupted(err) {
+				return fileList, fmt.Errorf("infrastructure was not ready within %v: %w", timeout, err)
+			}
+			return fileList, fmt.Errorf("infrastructure is not ready: %w", err)
 		}
 		if cluster == nil {
 			return fileList, fmt.Errorf("error occurred during load balancer ready check")
@@ -186,6 +204,8 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
 			return fileList, fmt.Errorf("control plane endpoint is not set")
 		}
 	}
+	timer.StopTimer(infrastructureStage)
+	logrus.Info("Network infrastructure is ready")
 
 	if p, ok := i.impl.(InfraReadyProvider); ok {
 		infraReadyInput := InfraReadyInput{
@@ -194,9 +214,11 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
 			InfraID:       clusterID.InfraID,
 		}
 
+		timer.StartTimer(infrastructureReadyStage)
 		if err := p.InfraReady(ctx, infraReadyInput); err != nil {
 			return fileList, fmt.Errorf("failed provisioning resources after infrastructure ready: %w", err)
 		}
+		timer.StopTimer(infrastructureReadyStage)
 	} else {
 		logrus.Debugf("No infrastructure ready requirements for the %s provider", i.impl.Name())
 	}
@@ -217,9 +239,11 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
 			TFVarsAsset:      tfvarsAsset,
 		}
 
+		timer.StartTimer(ignitionStage)
 		if bootstrapIgnData, err = p.Ignition(ctx, ignInput); err != nil {
 			return fileList, fmt.Errorf("failed preparing ignition data: %w", err)
 		}
+		timer.StopTimer(ignitionStage)
 	} else {
 		logrus.Debugf("No Ignition requirements for the %s provider", i.impl.Name())
 	}
@@ -227,6 +251,7 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
 	masterIgnSecret := IgnitionSecret(masterIgnAsset.Files()[0].Data, clusterID.InfraID, "master")
 	machineManifests = append(machineManifests, bootstrapIgnSecret, masterIgnSecret)
 
+	timer.StartTimer(machineStage)
 	// Create the machine manifests.
 	for _, m := range machineManifests {
 		m.SetNamespace(capiutils.Namespace)
@@ -242,7 +267,9 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
 		masterCount = *reps
 	}
 
-	logrus.Debugf("Waiting for machines to provision")
+	untilTime := time.Now().Add(timeout)
+	timezone, _ := untilTime.Zone()
+	logrus.Infof("Waiting up to %v (until %v %s) for machines to provision...", timeout, untilTime.Format(time.Kitchen), timezone)
 	if err := wait.ExponentialBackoffWithContext(ctx, wait.Backoff{
 		Duration: time.Second * 10,
 		Factor:   float64(1.5),
@@ -271,9 +298,14 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
 			}
 			return true, nil
 		}); err != nil {
-			return fileList, fmt.Errorf("machines were not provisioned within %v: %w", timeout, err)
+			if wait.Interrupted(err) {
+				return fileList, fmt.Errorf("control-plane machines were not provisioned within %v: %w", timeout, err)
+			}
+			return fileList, fmt.Errorf("control-plane machines are not ready: %w", err)
 		}
 	}
+	timer.StopTimer(machineStage)
+	logrus.Info("Control-plane machines are ready")
 
 	if p, ok := i.impl.(PostProvider); ok {
 		postMachineInput := PostProvisionInput{
@@ -282,9 +314,11 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
 			InfraID:       clusterID.InfraID,
 		}
 
+		timer.StartTimer(postProvisionStage)
 		if err = p.PostProvision(ctx, postMachineInput); err != nil {
 			return fileList, fmt.Errorf("failed during post-machine creation hook: %w", err)
 		}
+		timer.StopTimer(postProvisionStage)
 	}
 
 	// For each manifest we created, retrieve it and store it in the asset.
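
For reference, a minimal, self-contained sketch of the polling pattern both patches converge on: an exponential backoff whose delay is capped at the stage timeout, with wait.Interrupted used to tell "ran out of attempts" apart from a real provisioning error. The function, condition, and messages below are invented for illustration and are not installer code.

// Sketch only: mirrors the wait.ExponentialBackoffWithContext usage in the
// patches above, with an invented readiness condition.
package main

import (
	"context"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

const timeout = 15 * time.Minute

func waitForReady(ctx context.Context, ready func() bool) error {
	untilTime := time.Now().Add(timeout)
	timezone, _ := untilTime.Zone()
	fmt.Printf("Waiting up to %v (until %v %s) for the resource to become ready...\n",
		timeout, untilTime.Format(time.Kitchen), timezone)

	err := wait.ExponentialBackoffWithContext(ctx, wait.Backoff{
		Duration: time.Second * 10, // first delay between polls
		Factor:   float64(1.5),     // each delay is 1.5x the previous one
		Steps:    32,               // at most 32 polls
		Cap:      timeout,          // an individual delay never exceeds the stage timeout
	}, func(ctx context.Context) (bool, error) {
		// Returning true stops polling; a non-nil error aborts immediately.
		return ready(), nil
	})
	if err != nil {
		if wait.Interrupted(err) { // backoff exhausted or context cancelled
			return fmt.Errorf("resource was not ready within %v: %w", timeout, err)
		}
		return fmt.Errorf("resource is not ready: %w", err)
	}
	return nil
}

func main() {
	readyAt := time.Now().Add(5 * time.Second)
	fmt.Println(waitForReady(context.Background(), func() bool { return time.Now().After(readyAt) }))
}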
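
The second patch's stage timers follow the same bracketing idea at every step. The diff only shows StartTimer/StopTimer calls from pkg/metrics/timer; the stand-in below is a hypothetical equivalent, not that package, and only illustrates how bracketing each stage yields durations that can be reported at install complete and used to tune the 15m timeout.

// Hypothetical stand-in for a stage timer; the installer's real
// pkg/metrics/timer package is not reproduced here.
package main

import (
	"fmt"
	"sync"
	"time"
)

type stageTimer struct {
	mu        sync.Mutex
	started   map[string]time.Time
	durations map[string]time.Duration
	order     []string // stages in the order they were started
}

func newStageTimer() *stageTimer {
	return &stageTimer{
		started:   map[string]time.Time{},
		durations: map[string]time.Duration{},
	}
}

// StartTimer records the moment a named stage begins.
func (t *stageTimer) StartTimer(stage string) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.started[stage] = time.Now()
	t.order = append(t.order, stage)
}

// StopTimer records how long the named stage took.
func (t *stageTimer) StopTimer(stage string) {
	t.mu.Lock()
	defer t.mu.Unlock()
	if start, ok := t.started[stage]; ok {
		t.durations[stage] = time.Since(start)
	}
}

// Summary prints each stage duration in start order, e.g. when the install completes.
func (t *stageTimer) Summary() {
	t.mu.Lock()
	defer t.mu.Unlock()
	for _, stage := range t.order {
		fmt.Printf("%-45s %v\n", stage, t.durations[stage].Round(time.Millisecond))
	}
}

func main() {
	timer := newStageTimer()
	timer.StartTimer("Network-infrastructure Provisioning")
	time.Sleep(20 * time.Millisecond) // stands in for the real provisioning work
	timer.StopTimer("Network-infrastructure Provisioning")
	timer.Summary()
}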