Skip to content

Commit d6033eb

Browse files
Merge pull request #412 from hzxuzhonghu/max-retry
Add maxRetry in job controller to prevent endless loop
2 parents 2b7fbe2 + 490ba7b commit d6033eb

File tree

1 file changed

+19
-5
lines changed

1 file changed

+19
-5
lines changed

pkg/controllers/job/job_controller.go

+19-5
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,15 @@ import (
5656
"volcano.sh/volcano/pkg/controllers/job/state"
5757
)
5858

59+
const (
60+
// maxRetries is the number of times a volcano job will be retried before it is dropped out of the queue.
61+
// With the current rate-limiter in use (the n-th retry is delayed by 5ms*2^(n-1)), the following numbers
62+
// represent the successive delays before a volcano job is requeued:
63+
//
64+
// 5ms, 10ms, 20ms, 40ms, 80ms, 160ms, 320ms, 640ms, 1.3s, 2.6s, 5.1s, 10.2s, 20.4s, 41s, 82s
65+
maxRetries = 15
66+
)
67+
5968
// Controller is the Job Controller type.
6069
type Controller struct {
6170
kubeClients kubernetes.Interface
@@ -311,11 +320,16 @@ func (cc *Controller) processNextReq(count uint32) bool {
311320
}
312321

313322
if err := st.Execute(action); err != nil {
314-
glog.Errorf("Failed to handle Job <%s/%s>: %v",
315-
jobInfo.Job.Namespace, jobInfo.Job.Name, err)
316-
// If any error, requeue it.
317-
queue.AddRateLimited(req)
318-
return true
323+
if queue.NumRequeues(req) < maxRetries {
324+
glog.V(2).Infof("Failed to handle Job <%s/%s>: %v",
325+
jobInfo.Job.Namespace, jobInfo.Job.Name, err)
326+
// If any error, requeue it.
327+
queue.AddRateLimited(req)
328+
return true
329+
}
330+
cc.recordJobEvent(jobInfo.Job.Namespace, jobInfo.Job.Name, vkbatchv1.ExecuteAction, fmt.Sprintf(
331+
"Job failed on action %s for retry limit reached", action))
332+
glog.Warningf("Dropping job<%s/%s> out of the queue: %v because max retries has reached", jobInfo.Job.Namespace, jobInfo.Job.Name, err)
319333
}
320334

321335
// If no error, forget it.

0 commit comments

Comments
 (0)