Skip to content

Commit 9cd5918

Browse files
committed
[ci] Add retries to docker push
This should mitigate failures like the one in https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/main/4274/pipeline. This also moves the `retry` function into a standalone script (`ci/scripts/retry.sh`) now that we have PR #12604.
1 parent a408493 commit 9cd5918

File tree

8 files changed

+223
-1443
lines changed

8 files changed

+223
-1443
lines changed

Jenkinsfile

Lines changed: 157 additions & 1398 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ci/jenkins/Build.groovy.j2

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,8 @@ stage('Build') {
8484
if (!skip_ci) {
8585
node('CPU-SMALL') {
8686
ws({{ m.per_exec_ws('tvm/build-gpu') }}) {
87-
docker_init(ci_gpu)
8887
init_git()
88+
docker_init(ci_gpu)
8989
sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
9090
make("${ci_gpu} --no-gpu", 'build', '-j2')
9191
{{ m.upload_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }}
@@ -102,8 +102,8 @@ stage('Build') {
102102
if (!skip_ci && is_docs_only_build != 1) {
103103
node('CPU-SMALL') {
104104
ws({{ m.per_exec_ws('tvm/build-cpu') }}) {
105-
docker_init(ci_cpu)
106105
init_git()
106+
docker_init(ci_cpu)
107107
sh (
108108
script: "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh build",
109109
label: 'Create CPU cmake config',
@@ -126,8 +126,8 @@ stage('Build') {
126126
if (!skip_ci && is_docs_only_build != 1) {
127127
node('CPU-SMALL') {
128128
ws({{ m.per_exec_ws('tvm/build-cpu-minimal') }}) {
129-
docker_init(ci_minimal)
130129
init_git()
130+
docker_init(ci_minimal)
131131
sh (
132132
script: "${docker_run} ${ci_minimal} ./tests/scripts/task_config_build_minimal.sh build",
133133
label: 'Create CPU minimal cmake config',
@@ -144,8 +144,8 @@ stage('Build') {
144144
if (!skip_ci && is_docs_only_build != 1) {
145145
node('CPU-SMALL') {
146146
ws({{ m.per_exec_ws('tvm/build-wasm') }}) {
147-
docker_init(ci_wasm)
148147
init_git()
148+
docker_init(ci_wasm)
149149
sh (
150150
script: "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh build",
151151
label: 'Create WASM cmake config',
@@ -169,8 +169,8 @@ stage('Build') {
169169
if (!skip_ci && is_docs_only_build != 1) {
170170
node('CPU-SMALL') {
171171
ws({{ m.per_exec_ws('tvm/build-i386') }}) {
172-
docker_init(ci_i386)
173172
init_git()
173+
docker_init(ci_i386)
174174
sh (
175175
script: "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh build",
176176
label: 'Create i386 cmake config',
@@ -187,8 +187,8 @@ stage('Build') {
187187
if (!skip_ci && is_docs_only_build != 1) {
188188
node('ARM-SMALL') {
189189
ws({{ m.per_exec_ws('tvm/build-arm') }}) {
190-
docker_init(ci_arm)
191190
init_git()
191+
docker_init(ci_arm)
192192
sh (
193193
script: "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh build",
194194
label: 'Create ARM cmake config',
@@ -205,8 +205,8 @@ stage('Build') {
205205
if (!skip_ci && is_docs_only_build != 1) {
206206
node('CPU-SMALL') {
207207
ws({{ m.per_exec_ws('tvm/build-cortexm') }}) {
208-
docker_init(ci_cortexm)
209208
init_git()
209+
docker_init(ci_cortexm)
210210
sh (
211211
script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_config_build_cortexm.sh build",
212212
label: 'Create Cortex-M cmake config',
@@ -223,8 +223,8 @@ stage('Build') {
223223
if (!skip_ci && is_docs_only_build != 1) {
224224
node('CPU-SMALL') {
225225
ws({{ m.per_exec_ws('tvm/build-hexagon') }}) {
226-
docker_init(ci_hexagon)
227226
init_git()
227+
docker_init(ci_hexagon)
228228
sh (
229229
script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_config_build_hexagon.sh build",
230230
label: 'Create Hexagon cmake config',
@@ -245,8 +245,8 @@ stage('Build') {
245245
if (!skip_ci && is_docs_only_build != 1) {
246246
node('CPU-SMALL') {
247247
ws({{ m.per_exec_ws('tvm/build-riscv') }}) {
248-
docker_init(ci_riscv)
249248
init_git()
249+
docker_init(ci_riscv)
250250
sh (
251251
script: "${docker_run} ${ci_riscv} ./tests/scripts/task_config_build_riscv.sh build",
252252
label: 'Create RISC-V cmake config',

ci/jenkins/Deploy.groovy.j2

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,11 @@ def update_docker(ecr_image, hub_image) {
3030
sh(
3131
script: """
3232
set -eux
33+
. ci/scripts/retry.sh
3334
docker tag \
3435
${ecr_image} \
3536
${hub_image}
36-
docker push ${hub_image}
37+
retry 5 docker push ${hub_image}
3738
""",
3839
label: "Update ${hub_image} on Docker Hub",
3940
)
@@ -144,9 +145,10 @@ def deploy() {
144145
sh(
145146
script: """
146147
set -eux
148+
. ci/scripts/retry.sh
147149
docker pull tlcpackstaging/{{ image.name }}:${tag}
148150
docker tag tlcpackstaging/{{ image.name }}:${tag} tlcpack/{{ image.name.replace("_", "-") }}:${tag}
149-
docker push tlcpack/{{ image.name.replace("_", "-") }}:${tag}
151+
retry 5 docker push tlcpack/{{ image.name.replace("_", "-") }}:${tag}
150152
""",
151153
label: 'Tag tlcpackstaging/{{ image.name }} image to tlcpack',
152154
)

ci/jenkins/DockerBuild.groovy.j2

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,9 @@ def ecr_push(full_name) {
2121
sh(
2222
script: """
2323
set -x
24+
. ci/scripts/retry.sh
2425
docker tag ${full_name} \$AWS_ECR_REPO/${full_name}
25-
docker push \$AWS_ECR_REPO/${full_name}
26+
retry 5 docker push \$AWS_ECR_REPO/${full_name}
2627
""",
2728
label: 'Upload image to ECR'
2829
)
@@ -63,7 +64,8 @@ def ecr_pull(full_name) {
6364
sh(
6465
script: """
6566
set -eux
66-
docker pull ${full_name}
67+
. ci/scripts/retry.sh
68+
retry 5 docker pull ${full_name}
6769
""",
6870
label: 'Pull image from ECR'
6971
)

ci/jenkins/Prepare.groovy.j2

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def init_git() {
3333
sh(
3434
script: """
3535
set -eux
36-
{{ m.bash_retry() }}
36+
. ci/scripts/retry.sh
3737
retry 3 timeout 5m git submodule update --init -f --jobs 0
3838
""",
3939
label: 'Update git submodules',
@@ -65,8 +65,8 @@ def docker_init(image) {
6565
sh(
6666
script: """
6767
set -eux
68-
{{ m.bash_retry() }}
69-
retry 3 docker pull ${image}
68+
. ci/scripts/retry.sh
69+
retry 5 docker pull ${image}
7070
""",
7171
label: 'Pull docker image',
7272
)

ci/jenkins/Test.groovy.j2

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -294,8 +294,8 @@ stage('Test') {
294294
if (!skip_ci) {
295295
node('GPU') {
296296
ws({{ m.per_exec_ws('tvm/docs-python-gpu') }}) {
297-
docker_init(ci_gpu)
298297
init_git()
298+
docker_init(ci_gpu)
299299
{{ m.download_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }}
300300
add_microtvm_permissions()
301301
timeout(time: 180, unit: 'MINUTES') {

ci/jenkins/macros.j2

Lines changed: 6 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ def {{ method_name }}() {
3939
node('{{ node }}') {
4040
ws({{ per_exec_ws(ws) }}) {
4141
try {
42-
docker_init({{ docker_image }})
4342
init_git()
43+
docker_init({{ docker_image }})
4444
timeout(time: max_time, unit: 'MINUTES') {
4545
withEnv([
4646
'PLATFORM={{ platform }}',
@@ -71,8 +71,8 @@ def {{ method_name }}() {
7171
'{{ name }} {{ shard_index }} of {{ num_shards }}': {
7272
node('{{ node }}') {
7373
ws({{ per_exec_ws(ws) }}) {
74-
docker_init({{ docker_image }})
7574
init_git()
75+
docker_init({{ docker_image }})
7676
timeout(time: max_time, unit: 'MINUTES') {
7777
withEnv([
7878
'TVM_NUM_SHARDS={{ num_shards }}',
@@ -95,8 +95,8 @@ def {{ method_name }}() {
9595
ws({{ per_exec_ws(ws) }}) {
9696
timeout(time: max_time, unit: 'MINUTES') {
9797
try {
98-
docker_init({{ docker_image }})
9998
init_git()
99+
docker_init({{ docker_image }})
100100
withEnv(['PLATFORM={{ platform }}'], {
101101
{{ caller() | indent(width=8) | trim }}
102102
})
@@ -120,8 +120,8 @@ def {{ method_name }}() {
120120
ws({{ per_exec_ws(ws) }}) {
121121
timeout(time: max_time, unit: 'MINUTES') {
122122
try {
123-
docker_init({{ docker_image }})
124123
init_git()
124+
docker_init({{ docker_image }})
125125
withEnv(['PLATFORM={{ platform }}',
126126
'TEST_STEP_NAME={{ name }}',
127127
"SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -140,28 +140,6 @@ def {{ method_name }}() {
140140
},
141141
{% endmacro %}
142142

143-
{% macro bash_retry() %}
144-
retry() {
145-
local max_retries=\$1
146-
shift
147-
local n=0
148-
local backoff_max=30
149-
until [ "\$n" -ge \$max_retries ]
150-
do
151-
"\$@" && break
152-
n=\$((n+1))
153-
if [ "\$n" -eq \$max_retries ]; then
154-
echo "failed to update after attempt \$n / \$max_retries, giving up"
155-
exit 1
156-
fi
157-
158-
WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
159-
echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
160-
sleep \$WAIT
161-
done
162-
}
163-
{% endmacro %}
164-
165143
{% macro deploy_step(name, feature_flag, ws) %}
166144
'{{ name }}': {
167145
if ({{ feature_flag }}) {
@@ -182,7 +160,7 @@ retry() {
182160
sh(
183161
script: """
184162
set -eux
185-
{{ bash_retry() | indent(width=14) }}
163+
. ci/scripts/retry.sh
186164
{% for filename in filenames %}
187165
md5sum {{ filename }}
188166
retry 3 aws s3 cp --no-progress {{ filename }} s3://${s3_prefix}/{{ tag }}/{{ filename }}
@@ -199,7 +177,7 @@ sh(
199177
sh(
200178
script: """
201179
set -eux
202-
{{ bash_retry() | indent(width=14) }}
180+
. ci/scripts/retry.sh
203181
{% for filename in filenames %}
204182
retry 3 aws s3 cp --no-progress s3://${s3_prefix}/{{ tag }}/{{ filename }} {{ filename }}
205183
md5sum {{ filename }}

ci/scripts/retry.sh

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/usr/bin/env bash
2+
3+
# Licensed to the Apache Software Foundation (ASF) under one
4+
# or more contributor license agreements. See the NOTICE file
5+
# distributed with this work for additional information
6+
# regarding copyright ownership. The ASF licenses this file
7+
# to you under the Apache License, Version 2.0 (the
8+
# "License"); you may not use this file except in compliance
9+
# with the License. You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing,
14+
# software distributed under the License is distributed on an
15+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16+
# KIND, either express or implied. See the License for the
17+
# specific language governing permissions and limitations
18+
# under the License.
19+
20+
set -eux
21+
22+
# Run a command, retrying on failure with a randomized pause between attempts.
#
# Usage: retry <max_retries> <command> [args...]
#
#   max_retries  Total number of attempts to make; must be a positive integer.
#   command      Command (plus arguments) to execute on each attempt.
#
# Returns 0 as soon as one attempt succeeds. When every attempt fails, or when
# the arguments are invalid, this calls `exit 1`. Because this file is meant to
# be sourced (`. ci/scripts/retry.sh`), that terminates the calling script
# rather than just this function.
retry() {
  local max_retries=${1:-}

  # Reject an empty or non-numeric count up front: a non-numeric value makes
  # the `[ ... ]` comparisons below error out, which would retry forever.
  case "$max_retries" in
    ''|*[!0-9]*)
      echo "retry: max_retries must be a positive integer, got '$max_retries'" >&2
      exit 1
      ;;
  esac

  # A count of 0 would skip the loop entirely and report success without ever
  # running the command; a missing command has nothing to run.
  if [ "$max_retries" -lt 1 ] || [ "$#" -lt 2 ]; then
    echo "usage: retry <max_retries> <command> [args...]" >&2
    exit 1
  fi
  shift

  local n=0
  local wait_seconds
  until [ "$n" -ge "$max_retries" ]
  do
    # `cmd && break` keeps a failing attempt from tripping `set -e`.
    "$@" && break
    n=$((n+1))
    if [ "$n" -eq "$max_retries" ]; then
      echo "failed to update after attempt $n / $max_retries, giving up"
      exit 1
    fi

    # Sleep a random 10-30 seconds (inclusive) so parallel jobs do not retry
    # in lockstep. Fall back to a fixed delay if python3 is unavailable.
    wait_seconds=$(python3 -c 'import random; print(random.randint(10, 30))') || wait_seconds=20
    echo "failed to update $n / $max_retries, waiting $wait_seconds to try again"
    sleep "$wait_seconds"
  done
}

0 commit comments

Comments
 (0)