Stop producing pod unschedulable errors #3864

Merged · 8 commits · Aug 6, 2024
Changes from 7 commits
40 changes: 0 additions & 40 deletions internal/executor/reporter/event.go
@@ -132,46 +132,6 @@ func getPodNumber(pod *v1.Pod) int32 {
 	return int32(podNumber)
 }
 
-func CreateJobUnableToScheduleEvent(pod *v1.Pod, reason string, clusterId string) (*armadaevents.EventSequence, error) {
-	sequence := createEmptySequence(pod)
-	jobId, runId, err := extractIds(pod)
-	if err != nil {
-		return nil, err
-	}
-
-	sequence.Events = append(sequence.Events, &armadaevents.EventSequence_Event{
-		Created: types.TimestampNow(),
-		Event: &armadaevents.EventSequence_Event_JobRunErrors{
-			JobRunErrors: &armadaevents.JobRunErrors{
-				RunId: runId,
-				RunIdStr: armadaevents.MustUuidStringFromProtoUuid(runId),
-				JobId: jobId,
-				JobIdStr: armadaevents.MustUlidStringFromProtoUuid(jobId),
-				Errors: []*armadaevents.Error{
-					{
-						Terminal: false, // EventMessage_UnableToSchedule indicates an issue with job to start up - info only
-						Reason: &armadaevents.Error_PodUnschedulable{
-							PodUnschedulable: &armadaevents.PodUnschedulable{
-								ObjectMeta: &armadaevents.ObjectMeta{
-									KubernetesId: string(pod.ObjectMeta.UID),
-									Name: pod.Name,
-									Namespace: pod.Namespace,
-									ExecutorId: clusterId,
-								},
-								Message: reason,
-								NodeName: pod.Spec.NodeName,
-								PodNumber: getPodNumber(pod),
-							},
-						},
-					},
-				},
-			},
-		},
-	})
-
-	return sequence, nil
-}
-
 func CreateJobIngressInfoEvent(pod *v1.Pod, clusterId string, associatedServices []*v1.Service, associatedIngresses []*networking.Ingress) (*armadaevents.EventSequence, error) {
 	if pod.Spec.NodeName == "" || pod.Status.HostIP == "" {
 		return nil, errors.Errorf("unable to create JobIngressInfoEvent for pod %s (%s), as pod is not allocated to a node", pod.Name, pod.Namespace)
32 changes: 1 addition & 31 deletions internal/executor/service/pod_issue_handler.go
@@ -350,24 +350,12 @@ func (p *IssueHandler) handleNonRetryableJobIssue(issue *issue) {
 	log.Infof("Handling non-retryable issue detected for job %s run %s", issue.RunIssue.JobId, issue.RunIssue.RunId)
 	podIssue := issue.RunIssue.PodIssue
 
-	events := make([]reporter.EventMessage, 0, 2)
-	if podIssue.Type == StuckStartingUp || podIssue.Type == UnableToSchedule {
-		unableToScheduleEvent, err := reporter.CreateJobUnableToScheduleEvent(podIssue.OriginalPodState, podIssue.Message, p.clusterContext.GetClusterId())
-		if err != nil {
-			log.Errorf("Failed to create unable to schedule event for job %s because %s", issue.RunIssue.JobId, err)
-			return
-		}
-		events = append(events, reporter.EventMessage{Event: unableToScheduleEvent, JobRunId: issue.RunIssue.RunId})
-	}
 	failedEvent, err := reporter.CreateSimpleJobFailedEvent(podIssue.OriginalPodState, podIssue.Message, podIssue.DebugMessage, p.clusterContext.GetClusterId(), podIssue.Cause)
 	if err != nil {
 		log.Errorf("Failed to create failed event for job %s because %s", issue.RunIssue.JobId, err)
 		return
 	}
-
-	events = append(events, reporter.EventMessage{Event: failedEvent, JobRunId: issue.RunIssue.RunId})
-
-	err = p.eventReporter.Report(events)
+	err = p.eventReporter.Report([]reporter.EventMessage{{Event: failedEvent, JobRunId: issue.RunIssue.RunId}})
 	if err != nil {
 		log.Errorf("Failed to report failed event for job %s because %s", issue.RunIssue.JobId, err)
 		return
@@ -384,28 +372,10 @@ func (p *IssueHandler) handleNonRetryableJobIssue(issue *issue) {
 }
 
 // For retryable issues we must:
-// - Report JobUnableToScheduleEvent
 // - Report JobReturnLeaseEvent
 //
 // If the pod becomes Running/Completed/Failed in the middle of being deleted - swap this issue to a nonRetryableIssue where it will be Failed
 func (p *IssueHandler) handleRetryableJobIssue(issue *issue) {
 	if !issue.RunIssue.Reported {
 		log.Infof("Handling retryable issue for job %s run %s", issue.RunIssue.JobId, issue.RunIssue.RunId)
-		if issue.RunIssue.PodIssue.Type == StuckStartingUp || issue.RunIssue.PodIssue.Type == UnableToSchedule {
-			event, err := reporter.CreateJobUnableToScheduleEvent(issue.RunIssue.PodIssue.OriginalPodState, issue.RunIssue.PodIssue.Message, p.clusterContext.GetClusterId())
-			if err != nil {
-				log.Errorf("Failed to create unable to schedule event for job %s because %s", issue.RunIssue.JobId, err)
-				return
-			}
-			err = p.eventReporter.Report([]reporter.EventMessage{{Event: event, JobRunId: issue.RunIssue.RunId}})
-			if err != nil {
-				log.Errorf("Failure to report stuck pod event %+v because %s", event, err)
-				return
-			}
-		}
 		p.markIssueReported(issue.RunIssue)
 	}
 
 	if issue.CurrentPodState != nil {
 		if issue.CurrentPodState.Status.Phase != v1.PodPending {
 			p.markIssuesResolved(issue.RunIssue)
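Taken together, these two hunks leave the executor reporting only the events that still matter: a single terminal failed event for non-retryable issues and, per the comment above, a returned lease for retryable ones. As a reading aid, here is a rough sketch of the non-retryable path after this change, reconstructed only from lines visible in this diff; the steps that follow the Report call are outside the visible hunk and are marked as assumptions.

func (p *IssueHandler) handleNonRetryableJobIssue(issue *issue) {
	log.Infof("Handling non-retryable issue detected for job %s run %s", issue.RunIssue.JobId, issue.RunIssue.RunId)
	podIssue := issue.RunIssue.PodIssue

	// Build one terminal failed event; the interim "unable to schedule" event is no longer produced.
	failedEvent, err := reporter.CreateSimpleJobFailedEvent(podIssue.OriginalPodState, podIssue.Message, podIssue.DebugMessage, p.clusterContext.GetClusterId(), podIssue.Cause)
	if err != nil {
		log.Errorf("Failed to create failed event for job %s because %s", issue.RunIssue.JobId, err)
		return
	}

	// Report exactly one event sequence for the run.
	err = p.eventReporter.Report([]reporter.EventMessage{{Event: failedEvent, JobRunId: issue.RunIssue.RunId}})
	if err != nil {
		log.Errorf("Failed to report failed event for job %s because %s", issue.RunIssue.JobId, err)
		return
	}

	// Assumption: the remainder of the function (deleting the pod and marking the issue reported)
	// is unchanged by this PR and simply falls outside the visible hunk.
}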
26 changes: 2 additions & 24 deletions internal/executor/service/pod_issue_handler_test.go
@@ -61,15 +61,9 @@ func TestPodIssueService_DeletesPodAndReportsFailed_IfStuckAndUnretryable(t *tes
 	remainingActivePods := getActivePods(t, fakeClusterContext)
 	assert.Equal(t, []*v1.Pod{}, remainingActivePods)
 
-	assert.Len(t, eventsReporter.ReceivedEvents, 2)
+	assert.Len(t, eventsReporter.ReceivedEvents, 1)
 	assert.Len(t, eventsReporter.ReceivedEvents[0].Event.Events, 1)
-	unableToScheduleEvent, ok := eventsReporter.ReceivedEvents[0].Event.Events[0].Event.(*armadaevents.EventSequence_Event_JobRunErrors)
-	assert.True(t, ok)
-	assert.Len(t, unableToScheduleEvent.JobRunErrors.Errors, 1)
-	assert.True(t, unableToScheduleEvent.JobRunErrors.Errors[0].GetPodUnschedulable() != nil)
-
-	assert.Len(t, eventsReporter.ReceivedEvents[1].Event.Events, 1)
-	failedEvent, ok := eventsReporter.ReceivedEvents[1].Event.Events[0].Event.(*armadaevents.EventSequence_Event_JobRunErrors)
+	failedEvent, ok := eventsReporter.ReceivedEvents[0].Event.Events[0].Event.(*armadaevents.EventSequence_Event_JobRunErrors)
 	assert.True(t, ok)
 	assert.Len(t, failedEvent.JobRunErrors.Errors, 1)
 	assert.Contains(t, failedEvent.JobRunErrors.Errors[0].GetPodError().Message, "unrecoverable problem")
@@ -157,14 +151,6 @@ func TestPodIssueService_DeletesPodAndReportsLeaseReturned_IfRetryableStuckPod(t
 	remainingActivePods := getActivePods(t, fakeClusterContext)
 	assert.Equal(t, []*v1.Pod{}, remainingActivePods)
 
-	// Reports UnableToSchedule
-	assert.Len(t, eventsReporter.ReceivedEvents, 1)
-	assert.Len(t, eventsReporter.ReceivedEvents[0].Event.Events, 1)
-	unableToScheduleEvent, ok := eventsReporter.ReceivedEvents[0].Event.Events[0].Event.(*armadaevents.EventSequence_Event_JobRunErrors)
-	assert.True(t, ok)
-	assert.Len(t, unableToScheduleEvent.JobRunErrors.Errors, 1)
-	assert.True(t, unableToScheduleEvent.JobRunErrors.Errors[0].GetPodUnschedulable() != nil)
-
 	// Reset events
 	eventsReporter.ReceivedEvents = []reporter.EventMessage{}
 	podIssueService.HandlePodIssues()
@@ -185,14 +171,6 @@ func TestPodIssueService_DeletesPodAndReportsFailed_IfRetryableStuckPodStartsUpA
 
 	podIssueService.HandlePodIssues()
 
-	// Reports UnableToSchedule
-	assert.Len(t, eventsReporter.ReceivedEvents, 1)
-	assert.Len(t, eventsReporter.ReceivedEvents[0].Event.Events, 1)
-	unableToScheduleEvent, ok := eventsReporter.ReceivedEvents[0].Event.Events[0].Event.(*armadaevents.EventSequence_Event_JobRunErrors)
-	assert.True(t, ok)
-	assert.Len(t, unableToScheduleEvent.JobRunErrors.Errors, 1)
-	assert.True(t, unableToScheduleEvent.JobRunErrors.Errors[0].GetPodUnschedulable() != nil)
-
 	// Reset events, and add pod back as running
 	eventsReporter.ReceivedEvents = []reporter.EventMessage{}
 	retryableStuckPod.Status.Phase = v1.PodRunning
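The test updates mirror that simplification: each scenario that previously asserted an interim unableToSchedule sequence now expects the fake reporter to have received exactly one sequence. Condensed from the first test above (setup of podIssueService, eventsReporter and fakeClusterContext omitted), the assertion pattern is:

// One reported sequence, carrying a single terminal pod error rather than a PodUnschedulable reason.
assert.Len(t, eventsReporter.ReceivedEvents, 1)
assert.Len(t, eventsReporter.ReceivedEvents[0].Event.Events, 1)

failedEvent, ok := eventsReporter.ReceivedEvents[0].Event.Events[0].Event.(*armadaevents.EventSequence_Event_JobRunErrors)
assert.True(t, ok)
assert.Len(t, failedEvent.JobRunErrors.Errors, 1)
assert.Contains(t, failedEvent.JobRunErrors.Errors[0].GetPodError().Message, "unrecoverable problem")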
1 change: 0 additions & 1 deletion testsuite/testcases/basic/failure_errimagepull.yaml
@@ -24,5 +24,4 @@ jobs:
 timeout: "300s"
 expectedEvents:
   - submitted:
-  - unableToSchedule:
   - failed: