Skip to content

Commit a15c276

Browse files
authored
Job controller implementation of backoff limit per index (kubernetes#118009)
1 parent f55f278 commit a15c276

File tree

9 files changed

+2345
-73
lines changed

9 files changed

+2345
-73
lines changed

pkg/controller/job/backoff_utils.go

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323

2424
v1 "k8s.io/api/core/v1"
2525
"k8s.io/client-go/tools/cache"
26+
"k8s.io/klog/v2"
2627
apipod "k8s.io/kubernetes/pkg/api/v1/pod"
2728
"k8s.io/utils/clock"
2829
"k8s.io/utils/pointer"
@@ -213,20 +214,39 @@ func getFinishTimeFromDeletionTimestamp(p *v1.Pod) *time.Time {
213214
}
214215

215216
func (backoff backoffRecord) getRemainingTime(clock clock.WithTicker, defaultBackoff time.Duration, maxBackoff time.Duration) time.Duration {
216-
if backoff.failuresAfterLastSuccess == 0 {
217+
return getRemainingTimeForFailuresCount(clock, defaultBackoff, maxBackoff, backoff.failuresAfterLastSuccess, backoff.lastFailureTime)
218+
}
219+
220+
// getRemainingTimePerIndex returns the remaining time left for a given index to
221+
// create the replacement pods. The number of consecutive pod failures for the
222+
// index is retrieved from the `job-index-failure-count` annotation of the
223+
// last failed pod within the index (represented by `lastFailedPod`).
224+
// The last failed pod is also used to determine the time of the last failure.
225+
func getRemainingTimePerIndex(logger klog.Logger, clock clock.WithTicker, defaultBackoff time.Duration, maxBackoff time.Duration, lastFailedPod *v1.Pod) time.Duration {
226+
if lastFailedPod == nil {
227+
// There is no previous failed pod for this index
228+
return time.Duration(0)
229+
}
230+
failureCount := getIndexAbsoluteFailureCount(logger, lastFailedPod) + 1
231+
lastFailureTime := getFinishedTime(lastFailedPod)
232+
return getRemainingTimeForFailuresCount(clock, defaultBackoff, maxBackoff, failureCount, &lastFailureTime)
233+
}
234+
235+
func getRemainingTimeForFailuresCount(clock clock.WithTicker, defaultBackoff time.Duration, maxBackoff time.Duration, failuresCount int32, lastFailureTime *time.Time) time.Duration {
236+
if failuresCount == 0 {
217237
return 0
218238
}
219239

220240
backoffDuration := defaultBackoff
221-
for i := 1; i < int(backoff.failuresAfterLastSuccess); i++ {
241+
for i := 1; i < int(failuresCount); i++ {
222242
backoffDuration = backoffDuration * 2
223243
if backoffDuration >= maxBackoff {
224244
backoffDuration = maxBackoff
225245
break
226246
}
227247
}
228248

229-
timeElapsedSinceLastFailure := clock.Since(*backoff.lastFailureTime)
249+
timeElapsedSinceLastFailure := clock.Since(*lastFailureTime)
230250

231251
if backoffDuration < timeElapsedSinceLastFailure {
232252
return 0

pkg/controller/job/backoff_utils_test.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"github.com/google/go-cmp/cmp"
2424
v1 "k8s.io/api/core/v1"
2525
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
26+
"k8s.io/klog/v2/ktesting"
2627
clocktesting "k8s.io/utils/clock/testing"
2728
"k8s.io/utils/pointer"
2829
)
@@ -466,3 +467,46 @@ func TestGetRemainingBackoffTime(t *testing.T) {
466467
})
467468
}
468469
}
470+
471+
func TestGetRemainingBackoffTimePerIndex(t *testing.T) {
472+
defaultTestTime := metav1.NewTime(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC))
473+
testCases := map[string]struct {
474+
currentTime time.Time
475+
maxBackoff time.Duration
476+
defaultBackoff time.Duration
477+
lastFailedPod *v1.Pod
478+
wantDuration time.Duration
479+
}{
480+
"no failures": {
481+
lastFailedPod: nil,
482+
defaultBackoff: 5 * time.Second,
483+
maxBackoff: 700 * time.Second,
484+
wantDuration: 0 * time.Second,
485+
},
486+
"two prev failures; current time and failure time are same": {
487+
lastFailedPod: buildPod().phase(v1.PodFailed).indexFailureCount("2").customDeletionTimestamp(defaultTestTime.Time).Pod,
488+
currentTime: defaultTestTime.Time,
489+
defaultBackoff: 5 * time.Second,
490+
maxBackoff: 700 * time.Second,
491+
wantDuration: 20 * time.Second,
492+
},
493+
"one prev failure counted and one ignored; current time and failure time are same": {
494+
lastFailedPod: buildPod().phase(v1.PodFailed).indexFailureCount("1").indexIgnoredFailureCount("1").customDeletionTimestamp(defaultTestTime.Time).Pod,
495+
currentTime: defaultTestTime.Time,
496+
defaultBackoff: 5 * time.Second,
497+
maxBackoff: 700 * time.Second,
498+
wantDuration: 20 * time.Second,
499+
},
500+
}
501+
502+
for name, tc := range testCases {
503+
t.Run(name, func(t *testing.T) {
504+
logger, _ := ktesting.NewTestContext(t)
505+
fakeClock := clocktesting.NewFakeClock(tc.currentTime.Truncate(time.Second))
506+
d := getRemainingTimePerIndex(logger, fakeClock, tc.defaultBackoff, tc.maxBackoff, tc.lastFailedPod)
507+
if d.Seconds() != tc.wantDuration.Seconds() {
508+
t.Errorf("Expected value of duration %v; got %v", tc.wantDuration, d)
509+
}
510+
})
511+
}
512+
}

0 commit comments

Comments
 (0)