Skip to content

Commit 7896654

Browse files
committed
Add maxRetry in job controller to prevent endless loop
1 parent 8dd5a34 commit 7896654

File tree

1 file changed

+18
-5
lines changed

1 file changed

+18
-5
lines changed

pkg/controllers/job/job_controller.go

+18-5
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,15 @@ import (
5656
"volcano.sh/volcano/pkg/controllers/job/state"
5757
)
5858

59+
const (
60+
// maxRetries is the maximum number of times a failed volcano job request is requeued before it is dropped out of the queue.
61+
// NOTE(review): assumes the queue uses an exponential per-item rate limiter with a 5ms base delay, so the
62+
// n-th retry waits 5ms*2^(n-1) (confirm where the queue is built). The successive delays before each requeue are then:
63+
//
64+
// 5ms, 10ms, 20ms, 40ms, 80ms, 160ms, 320ms, 640ms, 1.3s, 2.6s, 5.1s, 10.2s, 20.4s, 41s, 82s
65+
maxRetries = 15
66+
)
67+
5968
// Controller the Job Controller type
6069
type Controller struct {
6170
kubeClients kubernetes.Interface
@@ -312,11 +321,15 @@ func (cc *Controller) processNextReq(count uint32) bool {
312321
}
313322

314323
if err := st.Execute(action); err != nil {
315-
glog.Errorf("Failed to handle Job <%s/%s>: %v",
316-
jobInfo.Job.Namespace, jobInfo.Job.Name, err)
317-
// If any error, requeue it.
318-
queue.AddRateLimited(req)
319-
return true
324+
if queue.NumRequeues(req) < maxRetries {
325+
glog.V(2).Infof("Failed to handle Job <%s/%s>: %v",
326+
jobInfo.Job.Namespace, jobInfo.Job.Name, err)
327+
// Still under the retry limit: requeue the request with rate limiting.
328+
queue.AddRateLimited(req)
329+
return true
330+
}
331+
332+
glog.V(2).Infof("Dropping job<%s/%s> out of the queue: %v", jobInfo.Job.Namespace, jobInfo.Job.Name, err)
320333
}
321334

322335
// If no error, forget it.

0 commit comments

Comments
 (0)