Skip to content
This repository has been archived by the owner on Jan 8, 2024. It is now read-only.

Fix a few minor kubernetes papercuts #2399

Merged
merged 3 commits into from
Oct 1, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .changelog/2399.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
```release-note:feature
plugin/k8: Report events on failed pods when a deployment fails
```

```release-note:bug
plugin/k8: Setup Kubernetes services for different workspaces properly
```
58 changes: 52 additions & 6 deletions builtin/k8s/platform.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"k8s.io/apimachinery/pkg/api/errors"
k8sresource "k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/apimachinery/pkg/util/wait"
_ "k8s.io/client-go/plugin/pkg/client/auth"
Expand Down Expand Up @@ -580,6 +581,7 @@ func (p *Platform) resourceDeploymentCreate(
},
InitialDelaySeconds: initialDelaySeconds,
TimeoutSeconds: timeoutSeconds,
FailureThreshold: failureThreshold,
}
}

Expand Down Expand Up @@ -681,6 +683,8 @@ func (p *Platform) resourceDeploymentCreate(
return err
}

ev := clientSet.CoreV1().Events(ns)

// We successfully created or updated, so set the name on our state so
// that if we error, we'll partially clean up properly. THIS IS IMPORTANT.
state.Name = result.Name
Expand All @@ -698,20 +702,25 @@ func (p *Platform) resourceDeploymentCreate(
reportedError bool
)

timeout := 10 * time.Minute
// We wait the maximum amount of time that the deployment controller would wait for a pod
// to start before exiting. We double the time to allow for various Kubernetes based
// delays in startup, detection, and reporting.
timeout := time.Duration((timeoutSeconds*failureThreshold)+initialDelaySeconds) * 2 * time.Second

podsSeen := make(map[types.UID]string)

// Wait on the Pod to start
err = wait.PollImmediate(2*time.Second, timeout, func() (bool, error) {
err = wait.PollImmediate(time.Second, timeout, func() (bool, error) {
dep, err := dc.Get(ctx, result.Name, metav1.GetOptions{})
if err != nil {
return false, err
}

if time.Since(lastStatus) > 10*time.Second {
step.Update(fmt.Sprintf(
"Waiting on deployment to become available: %d/%d/%d",
"Waiting on deployment to become available: requested=%d running=%d ready=%d",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I noticed this change in your demo and I like it! 👍🏻 ✨

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bitmoji

*dep.Spec.Replicas,
dep.Status.UnavailableReplicas,
dep.Status.UnavailableReplicas+dep.Status.AvailableReplicas,
dep.Status.AvailableReplicas,
))
lastStatus = time.Now()
Expand All @@ -730,6 +739,8 @@ func (p *Platform) resourceDeploymentCreate(
}

for _, p := range pods.Items {
podsSeen[p.UID] = p.Name

for _, cs := range p.Status.ContainerStatuses {
if cs.Ready {
continue
Expand Down Expand Up @@ -760,6 +771,29 @@ func (p *Platform) resourceDeploymentCreate(
return false, nil
})
if err != nil {
step.Update("Error detected waiting for Deployment to start.")
evanphx marked this conversation as resolved.
Show resolved Hide resolved
step.Status(terminal.StatusError)
step.Abort()

ui.Output("The following is events for pods observed while attempting to start the Deployment", terminal.WithWarningStyle())

for uid, name := range podsSeen {
sel := ev.GetFieldSelector(nil, nil, nil, (*string)(&uid))

events, err := ev.List(ctx, metav1.ListOptions{
FieldSelector: sel.String(),
})
if err == nil {
ui.Output("Events for %s", name, terminal.WithHeaderStyle())
for _, ev := range events.Items {
if ev.Type == "Normal" {
continue
}
ui.Output(" %s: %s (%s)", ev.Type, ev.Message, ev.Reason)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also noticed this change in your demo, excited for this! 😄

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bitmoji

}
}
}

if err == wait.ErrWaitTimeout {
err = fmt.Errorf("Deployment was not able to start pods after %s", timeout)
}
Expand All @@ -772,6 +806,8 @@ func (p *Platform) resourceDeploymentCreate(
return nil
}

var deleteGrace = int64(120)

// Destroy deletes the K8S deployment.
func (p *Platform) resourceDeploymentDestroy(
ctx context.Context,
Expand All @@ -791,8 +827,14 @@ func (p *Platform) resourceDeploymentDestroy(
step.Done()

step = sg.Add("Deleting deployment...")

del := metav1.DeletePropagationBackground

deployclient := csinfo.Clientset.AppsV1().Deployments(ns)
if err := deployclient.Delete(ctx, state.Name, metav1.DeleteOptions{}); err != nil {
if err := deployclient.Delete(ctx, state.Name, metav1.DeleteOptions{
GracePeriodSeconds: &deleteGrace,
PropagationPolicy: &del,
}); err != nil {
return err
}

Expand Down Expand Up @@ -843,6 +885,10 @@ func (p *Platform) resourceAutoscalerCreate(
// we don't return the error, this was mostly to provide a helpful warning
log.Info("receieved error while listing pods in attempt to detect existing metrics-server: %s", err)
err = nil

// The apis return an non-nil but empty value when observing an error, so we need to be sure to
// skip the code below.
metricsPods = nil
}

if metricsPods != nil && len(metricsPods.Items) == 0 {
Expand Down Expand Up @@ -929,7 +975,7 @@ func (p *Platform) resourceAutoscalerDestroy(
sg terminal.StepGroup,
csinfo *clientsetInfo,
) error {
if p.config.AutoscaleConfig == nil && state.Name == "" {
if state.Name == "" {
// No autoscale config, so don't destroy one
return nil
}
Expand Down
8 changes: 7 additions & 1 deletion builtin/k8s/releaser.go
Original file line number Diff line number Diff line change
Expand Up @@ -724,12 +724,18 @@ func (r *Releaser) Release(
ctx context.Context,
log hclog.Logger,
src *component.Source,
job *component.JobInfo,
ui terminal.UI,
target *Deployment,
dcr *component.DeclaredResourcesResp,
) (*Release, error) {
var result Release
result.ServiceName = src.App

if job.Workspace == "default" {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should throw this in the labels too for the label selector.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure what ya mean....

result.ServiceName = src.App
} else {
result.ServiceName = src.App + "-" + job.Workspace
}

sg := ui.StepGroup()
defer sg.Wait()
Expand Down