From e064704d16467581f0bb19dd6b14e4ee2acb7b68 Mon Sep 17 00:00:00 2001 From: Evan Phoenix Date: Thu, 30 Sep 2021 16:13:06 -0700 Subject: [PATCH 1/3] Fix a few minor kubernetes f-k8-papercuts 1. This scales the wait time back so that bad deploys fail much faster. Previously we had it hardcoded to 10 minutes, now it's dynamic. 2. When a deployment fails, print out non Info events from the pods to help the user diagonise what happened. 3. Provide some small grace period on pod cleanup to allow for the potential of debugging the failed pods directly. 4. Handle not being able to search for metric pods 5. Handle destroying an old deployment that didn't have autoscaling defined yet. 6. Setup the kubernetes service with the workspace name if the workspace is not the default workspace --- builtin/k8s/platform.go | 57 ++++++++++++++++++++++++++++++++++++----- builtin/k8s/releaser.go | 8 +++++- 2 files changed, 58 insertions(+), 7 deletions(-) diff --git a/builtin/k8s/platform.go b/builtin/k8s/platform.go index aba8e886a16..99cfaf746bc 100644 --- a/builtin/k8s/platform.go +++ b/builtin/k8s/platform.go @@ -19,6 +19,7 @@ import ( "k8s.io/apimachinery/pkg/api/errors" k8sresource "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/apimachinery/pkg/util/wait" _ "k8s.io/client-go/plugin/pkg/client/auth" @@ -580,6 +581,7 @@ func (p *Platform) resourceDeploymentCreate( }, InitialDelaySeconds: initialDelaySeconds, TimeoutSeconds: timeoutSeconds, + FailureThreshold: failureThreshold, } } @@ -681,6 +683,8 @@ func (p *Platform) resourceDeploymentCreate( return err } + ev := clientSet.CoreV1().Events(ns) + // We successfully created or updated, so set the name on our state so // that if we error, we'll partially clean up properly. THIS IS IMPORTANT. state.Name = result.Name @@ -698,10 +702,15 @@ func (p *Platform) resourceDeploymentCreate( reportedError bool ) - timeout := 10 * time.Minute + // We wait the maximum amount of time that the deployment controller would wait for a pod + // to start before exitting. We double the time to allow for various kubernetes based + // delays in startup, detection, and reporting. + timeout := time.Duration((timeoutSeconds*failureThreshold)+initialDelaySeconds) * 2 * time.Second + + podsSeen := make(map[types.UID]string) // Wait on the Pod to start - err = wait.PollImmediate(2*time.Second, timeout, func() (bool, error) { + err = wait.PollImmediate(time.Second, timeout, func() (bool, error) { dep, err := dc.Get(ctx, result.Name, metav1.GetOptions{}) if err != nil { return false, err @@ -709,9 +718,9 @@ func (p *Platform) resourceDeploymentCreate( if time.Since(lastStatus) > 10*time.Second { step.Update(fmt.Sprintf( - "Waiting on deployment to become available: %d/%d/%d", + "Waiting on deployment to become available: requested=%d running=%d ready=%d", *dep.Spec.Replicas, - dep.Status.UnavailableReplicas, + dep.Status.UnavailableReplicas+dep.Status.AvailableReplicas, dep.Status.AvailableReplicas, )) lastStatus = time.Now() @@ -730,6 +739,8 @@ func (p *Platform) resourceDeploymentCreate( } for _, p := range pods.Items { + podsSeen[p.UID] = p.Name + for _, cs := range p.Status.ContainerStatuses { if cs.Ready { continue @@ -760,6 +771,28 @@ func (p *Platform) resourceDeploymentCreate( return false, nil }) if err != nil { + step.Update("Error detected waiting for Deployment to start.") + step.Abort() + + ui.Output("The following is events for pods observed while attempting to start the Deployment", terminal.WithWarningStyle()) + + for uid, name := range podsSeen { + sel := ev.GetFieldSelector(nil, nil, nil, (*string)(&uid)) + + events, err := ev.List(ctx, metav1.ListOptions{ + FieldSelector: sel.String(), + }) + if err == nil { + ui.Output("Events for %s", name, terminal.WithHeaderStyle()) + for _, ev := range events.Items { + if ev.Type == "Normal" { + continue + } + ui.Output(" %s: %s (%s)", ev.Type, ev.Message, ev.Reason) + } + } + } + if err == wait.ErrWaitTimeout { err = fmt.Errorf("Deployment was not able to start pods after %s", timeout) } @@ -772,6 +805,8 @@ func (p *Platform) resourceDeploymentCreate( return nil } +var deleteGrace = int64(120) + // Destroy deletes the K8S deployment. func (p *Platform) resourceDeploymentDestroy( ctx context.Context, @@ -791,8 +826,14 @@ func (p *Platform) resourceDeploymentDestroy( step.Done() step = sg.Add("Deleting deployment...") + + del := metav1.DeletePropagationBackground + deployclient := csinfo.Clientset.AppsV1().Deployments(ns) - if err := deployclient.Delete(ctx, state.Name, metav1.DeleteOptions{}); err != nil { + if err := deployclient.Delete(ctx, state.Name, metav1.DeleteOptions{ + GracePeriodSeconds: &deleteGrace, + PropagationPolicy: &del, + }); err != nil { return err } @@ -843,6 +884,10 @@ func (p *Platform) resourceAutoscalerCreate( // we don't return the error, this was mostly to provide a helpful warning log.Info("receieved error while listing pods in attempt to detect existing metrics-server: %s", err) err = nil + + // The apis return an non-nil but empty value when observing an error, so we need to be sure to + // skip the code below. + metricsPods = nil } if metricsPods != nil && len(metricsPods.Items) == 0 { @@ -929,7 +974,7 @@ func (p *Platform) resourceAutoscalerDestroy( sg terminal.StepGroup, csinfo *clientsetInfo, ) error { - if p.config.AutoscaleConfig == nil && state.Name == "" { + if state.Name == "" { // No autoscale config, so don't destroy one return nil } diff --git a/builtin/k8s/releaser.go b/builtin/k8s/releaser.go index 44da89ec090..51f83218a1d 100644 --- a/builtin/k8s/releaser.go +++ b/builtin/k8s/releaser.go @@ -724,12 +724,18 @@ func (r *Releaser) Release( ctx context.Context, log hclog.Logger, src *component.Source, + job *component.JobInfo, ui terminal.UI, target *Deployment, dcr *component.DeclaredResourcesResp, ) (*Release, error) { var result Release - result.ServiceName = src.App + + if job.Workspace == "default" { + result.ServiceName = src.App + } else { + result.ServiceName = src.App + "-" + job.Workspace + } sg := ui.StepGroup() defer sg.Wait() From 118cd58dfc075c85457026d401de459353b519c9 Mon Sep 17 00:00:00 2001 From: Evan Phoenix Date: Thu, 30 Sep 2021 16:22:37 -0700 Subject: [PATCH 2/3] Add changelog entry --- .changelog/2399.txt | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 .changelog/2399.txt diff --git a/.changelog/2399.txt b/.changelog/2399.txt new file mode 100644 index 00000000000..1c09b4a1991 --- /dev/null +++ b/.changelog/2399.txt @@ -0,0 +1,7 @@ +```release-note:feature +plugin/k8: Report events on failed pods when a deployment fails +``` + +```release-note:bugfix +plugin/k8: Setup Kubernetes services for different workspaces properly +``` From c0a556d59d3f3672ed8a94ca5fc04cddb86a0a8a Mon Sep 17 00:00:00 2001 From: Evan Phoenix Date: Fri, 1 Oct 2021 09:17:27 -0700 Subject: [PATCH 3/3] Apply suggestions from code review Co-authored-by: Brian Cain --- .changelog/2399.txt | 2 +- builtin/k8s/platform.go | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.changelog/2399.txt b/.changelog/2399.txt index 1c09b4a1991..16e069a2bac 100644 --- a/.changelog/2399.txt +++ b/.changelog/2399.txt @@ -2,6 +2,6 @@ plugin/k8: Report events on failed pods when a deployment fails ``` -```release-note:bugfix +```release-note:bug plugin/k8: Setup Kubernetes services for different workspaces properly ``` diff --git a/builtin/k8s/platform.go b/builtin/k8s/platform.go index 99cfaf746bc..f14cd55f917 100644 --- a/builtin/k8s/platform.go +++ b/builtin/k8s/platform.go @@ -703,7 +703,7 @@ func (p *Platform) resourceDeploymentCreate( ) // We wait the maximum amount of time that the deployment controller would wait for a pod - // to start before exitting. We double the time to allow for various kubernetes based + // to start before exiting. We double the time to allow for various Kubernetes based // delays in startup, detection, and reporting. timeout := time.Duration((timeoutSeconds*failureThreshold)+initialDelaySeconds) * 2 * time.Second @@ -772,6 +772,7 @@ func (p *Platform) resourceDeploymentCreate( }) if err != nil { step.Update("Error detected waiting for Deployment to start.") + step.Status(terminal.StatusError) step.Abort() ui.Output("The following is events for pods observed while attempting to start the Deployment", terminal.WithWarningStyle())