@@ -56,6 +56,15 @@ import (
56
56
"volcano.sh/volcano/pkg/controllers/job/state"
57
57
)
58
58
59
+ const (
60
+ // maxRetries is the number of times a volcano job will be requeued after a failed action before it is dropped out of the queue.
61
+ // With the current rate-limiter in use (the n-th requeue is delayed by 5ms*2^(n-1)) the following numbers are the
62
+ // successive delays before a volcano job is retried:
63
+ //
64
+ // 5ms, 10ms, 20ms, 40ms, 80ms, 160ms, 320ms, 640ms, 1.3s, 2.6s, 5.1s, 10.2s, 20.4s, 41s, 82s
65
+ maxRetries = 15
66
+ )
67
+
59
68
// Controller the Job Controller type
60
69
type Controller struct {
61
70
kubeClients kubernetes.Interface
@@ -311,11 +320,16 @@ func (cc *Controller) processNextReq(count uint32) bool {
311
320
}
312
321
313
322
if err := st .Execute (action ); err != nil {
314
- glog .Errorf ("Failed to handle Job <%s/%s>: %v" ,
315
- jobInfo .Job .Namespace , jobInfo .Job .Name , err )
316
- // If any error, requeue it.
317
- queue .AddRateLimited (req )
318
- return true
323
+ if queue .NumRequeues (req ) < maxRetries {
324
+ glog .V (2 ).Infof ("Failed to handle Job <%s/%s>: %v" ,
325
+ jobInfo .Job .Namespace , jobInfo .Job .Name , err )
326
+ // If any error, requeue it.
327
+ queue .AddRateLimited (req )
328
+ return true
329
+ }
330
+ cc .recordJobEvent (jobInfo .Job .Namespace , jobInfo .Job .Name , vkbatchv1 .ExecuteAction , fmt .Sprintf (
331
+ "Job failed on action %s for retry limit reached" , action ))
332
+ glog .Warningf ("Dropping job<%s/%s> out of the queue: %v because max retries has reached" , jobInfo .Job .Namespace , jobInfo .Job .Name , err )
319
333
}
320
334
321
335
// If no error, forget it.
0 commit comments