@@ -111,9 +111,24 @@ jobs:
111111 echo "::set-output name=matrix::$keys"
112112 echo "::set-output name=test_map::$test_map"
113113
# Runner health check: for each machine type in the matrix, start the GPU
# container on the matching self-hosted runner and run `nvidia-smi`.
# The test jobs in this workflow list this job under `needs`, so they are
# not started when this check does not succeed.
run_check_runners:
  name: Check Runners
  needs: setup
  strategy:
    matrix:
      machine_type:
        - single-gpu
        - multi-gpu
  # The runner must carry all three labels; the last one comes from the matrix.
  runs-on:
    - self-hosted
    - docker-gpu
    - '${{ matrix.machine_type }}'
  container:
    image: huggingface/transformers-all-latest-gpu
    options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
  steps:
    # The step fails if `nvidia-smi` exits with a non-zero status.
    - name: NVIDIA-SMI
      run: |
        nvidia-smi
128+
114129 run_tests_single_gpu :
115130 name : Model tests
116- needs : setup
131+ needs : [ setup, run_check_runners]
117132 # `dummy` means there is no test to run
118133 if : contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
119134 strategy :
@@ -198,7 +213,7 @@ jobs:
198213
199214 run_tests_multi_gpu :
200215 name : Model tests
201- needs : setup
216+ needs : [ setup, run_check_runners]
202217 # `dummy` means there is no test to run
203218 if : contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
204219 strategy :
@@ -285,7 +300,7 @@ jobs:
285300
286301 run_tests_torch_cuda_extensions_single_gpu :
287302 name : Torch CUDA extension tests
288- needs : setup
303+ needs : [ setup, run_check_runners]
289304 if : contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
290305 strategy :
291306 fail-fast : false
@@ -364,7 +379,7 @@ jobs:
364379
365380 run_tests_torch_cuda_extensions_multi_gpu :
366381 name : Torch CUDA extension tests
367- needs : setup
382+ needs : [ setup, run_check_runners]
368383 if : contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
369384 strategy :
370385 fail-fast : false
@@ -447,12 +462,20 @@ jobs:
447462 if : always()
448463 needs : [
449464 setup,
465+ run_check_runners,
450466 run_tests_single_gpu,
451467 run_tests_multi_gpu,
452468 run_tests_torch_cuda_extensions_single_gpu,
453469 run_tests_torch_cuda_extensions_multi_gpu
454470 ]
455471 steps :
472+ - name : Preliminary job status
473+ shell : bash
474+ # Print the results of the prerequisite jobs (`setup` and `run_check_runners`) so they are visible in this job's log
475+ run : |
476+ echo "Setup status: ${{ needs.setup.result }}"
477+ echo "Runner status: ${{ needs.run_check_runners.result }}"
478+
456479 # Necessary to get the correct branch name and commit SHA for `workflow_run` event
457480 # We also take into account the `push` event (we might want to test some changes in a branch)
458481 - name : Prepare custom environment variables
@@ -498,6 +521,9 @@ jobs:
498521 CI_TITLE_PUSH : ${{ github.event.head_commit.message }}
499522 CI_TITLE_WORKFLOW_RUN : ${{ github.event.workflow_run.head_commit.message }}
500523 CI_SHA : ${{ env.CI_SHA }}
524+ SETUP_STATUS : ${{ needs.setup.result }}
525+ RUNNER_STATUS : ${{ needs.run_check_runners.result }}
526+
501527 # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
502528 # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
503529 run : |
0 commit comments