@@ -466,7 +466,7 @@ func (qjm *XController) PreemptQueueJobs() {
466466 // Only back-off AWs that are in state running and not in state Failed
467467 if updateNewJob .Status .State != arbv1 .AppWrapperStateFailed {
468468 klog .Infof ("[PreemptQueueJobs] Adding preempted AppWrapper %s/%s to back off queue." , aw .Name , aw .Namespace )
469- go qjm .backoff (ctx , updateNewJob , "PreemptionTriggered" , string (message ))
469+ qjm .backoff (ctx , updateNewJob , "PreemptionTriggered" , string (message ))
470470 }
471471 }
472472 }
@@ -1155,7 +1155,7 @@ func (qjm *XController) ScheduleNext(qj *arbv1.AppWrapper) {
11551155 } else {
11561156 dispatchFailedMessage = "Cannot find an cluster with enough resources to dispatch AppWrapper."
11571157 klog .V (2 ).Infof ("[ScheduleNex] [Dispatcher Mode] %s %s\n " , dispatchFailedReason , dispatchFailedMessage )
1158- go qjm .backoff (ctx , qj , dispatchFailedReason , dispatchFailedMessage )
1158+ qjm .backoff (ctx , qj , dispatchFailedReason , dispatchFailedMessage )
11591159 }
11601160 } else { // Agent Mode
11611161 aggqj := qjm .GetAggregatedResources (qj )
@@ -1284,7 +1284,7 @@ func (qjm *XController) ScheduleNext(qj *arbv1.AppWrapper) {
12841284 // TODO: Remove forwarded logic as a big AW will never be forwarded
12851285 forwarded = true
12861286 // should we call backoff or update etcd?
1287- go qjm .backoff (ctx , qj , dispatchFailedReason , dispatchFailedMessage )
1287+ qjm .backoff (ctx , qj , dispatchFailedReason , dispatchFailedMessage )
12881288 }
12891289 }
12901290 forwarded = true
@@ -1347,7 +1347,7 @@ func (qjm *XController) ScheduleNext(qj *arbv1.AppWrapper) {
13471347 if qjm .quotaManager != nil && quotaFits {
13481348 qjm .quotaManager .Release (qj )
13491349 }
1350- go qjm .backoff (ctx , qj , dispatchFailedReason , dispatchFailedMessage )
1350+ qjm .backoff (ctx , qj , dispatchFailedReason , dispatchFailedMessage )
13511351 }
13521352 }
13531353 return nil
@@ -1672,6 +1672,20 @@ func (cc *XController) updateQueueJob(oldObj, newObj interface{}) {
16721672 }
16731673
16741674 klog .V (6 ).Infof ("[Informer-updateQJ] '%s/%s' *Delay=%.6f seconds normal enqueue Version=%s Status=%v" , newQJ .Namespace , newQJ .Name , time .Now ().Sub (newQJ .Status .ControllerFirstTimestamp .Time ).Seconds (), newQJ .ResourceVersion , newQJ .Status )
1675+ for _ , cond := range newQJ .Status .Conditions {
1676+ if cond .Type == arbv1 .AppWrapperCondBackoff {
1677+ //AWs that have backoff conditions have a delay of 10 seconds before getting added to enqueue.
1678+ //TODO: we could plug an interface here with back-off strategies for different MCAD use cases.
1679+ time .AfterFunc (time .Duration (cc .serverOption .BackoffTime )* time .Second , func () {
1680+ if cc .serverOption .QuotaEnabled && cc .quotaManager != nil {
1681+ cc .quotaManager .Release (newQJ )
1682+ }
1683+ cc .enqueue (newQJ )
1684+ })
1685+ return
1686+ }
1687+ }
1688+
16751689 // cc.eventQueue.Delete(oldObj)
16761690 cc .enqueue (newQJ )
16771691}
0 commit comments