Skip to content

Commit 81ab111

Browse files
authored
Add checks for some workflow jobs (#18583)
Co-authored-by: ydshieh <[email protected]>
1 parent 510c2a0 commit 81ab111

File tree

2 files changed

+80
-21
lines changed

2 files changed

+80
-21
lines changed

.github/workflows/self-push.yml

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,9 +111,24 @@ jobs:
111111
echo "::set-output name=matrix::$keys"
112112
echo "::set-output name=test_map::$test_map"
113113
114+
run_check_runners:
  # Preliminary job: runs `nvidia-smi` inside the test container once per machine type.
  # The test jobs below list this job in their `needs`.
  name: Check Runners
  needs: setup
  strategy:
    matrix:
      machine_type:
        - single-gpu
        - multi-gpu
  runs-on:
    - self-hosted
    - docker-gpu
    - '${{ matrix.machine_type }}'
  container:
    image: huggingface/transformers-all-latest-gpu
    options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
  steps:
    - name: NVIDIA-SMI
      run: |
        nvidia-smi
128+
114129
run_tests_single_gpu:
115130
name: Model tests
116-
needs: setup
131+
needs: [setup, run_check_runners]
117132
# `dummy` means there is no test to run
118133
if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
119134
strategy:
@@ -198,7 +213,7 @@ jobs:
198213

199214
run_tests_multi_gpu:
200215
name: Model tests
201-
needs: setup
216+
needs: [setup, run_check_runners]
202217
# `dummy` means there is no test to run
203218
if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
204219
strategy:
@@ -285,7 +300,7 @@ jobs:
285300

286301
run_tests_torch_cuda_extensions_single_gpu:
287302
name: Torch CUDA extension tests
288-
needs: setup
303+
needs: [setup, run_check_runners]
289304
if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
290305
strategy:
291306
fail-fast: false
@@ -364,7 +379,7 @@ jobs:
364379

365380
run_tests_torch_cuda_extensions_multi_gpu:
366381
name: Torch CUDA extension tests
367-
needs: setup
382+
needs: [setup, run_check_runners]
368383
if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
369384
strategy:
370385
fail-fast: false
@@ -447,12 +462,20 @@ jobs:
447462
if: always()
448463
needs: [
449464
setup,
465+
run_check_runners,
450466
run_tests_single_gpu,
451467
run_tests_multi_gpu,
452468
run_tests_torch_cuda_extensions_single_gpu,
453469
run_tests_torch_cuda_extensions_multi_gpu
454470
]
455471
steps:
472+
- name: Preliminary job status
473+
shell: bash
474+
# For the meaning of these environment variables, see the job `Setup`
475+
run: |
476+
echo "Setup status: ${{ needs.setup.result }}"
477+
echo "Runner status: ${{ needs.run_check_runners.result }}"
478+
456479
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
457480
# We also take into account the `push` event (we might want to test some changes in a branch)
458481
- name: Prepare custom environment variables
@@ -498,6 +521,9 @@ jobs:
498521
CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
499522
CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
500523
CI_SHA: ${{ env.CI_SHA }}
524+
SETUP_STATUS: ${{ needs.setup.result }}
525+
RUNNER_STATUS: ${{ needs.run_check_runners.result }}
526+
501527
# We pass `needs.setup.outputs.matrix` as the argument. `notification_service.py` has to convert
502528
# `models/bert` to `models_bert`, because the artifact names use `_` instead of `/`.
503529
run: |

utils/notification_service.py

Lines changed: 50 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -387,28 +387,52 @@ def payload(self) -> str:
387387
return json.dumps(blocks)
388388

389389
@staticmethod
def error_out(title, ci_title="", setup_failed=False, runner_failed=False):
    """Send a failure notice to the CI report channel instead of a test report.

    The message is made of a header with `title`, an optional section with
    `ci_title`, a header describing what went wrong, and a section with a
    button linking to the GitHub Actions run given by `GITHUB_RUN_ID`.

    Args:
        title: Plain text shown in the first header block.
        ci_title: Markdown text for an extra section; omitted when empty.
        setup_failed: Report that the setup job failed. Takes precedence
            over `runner_failed`.
        runner_failed: Report that the CI runners have problems.

    Raises:
        KeyError: If `GITHUB_RUN_ID` or `CI_SLACK_REPORT_CHANNEL_ID` is not
            set in the environment.
    """
    # Pick the most specific explanation; a setup failure wins over a runner failure.
    if setup_failed:
        reason = "💔 Setup job failed. Tests are not run. 😭"
    else:
        reason = (
            "💔 CI runners have problems! Tests are not run. 😭"
            if runner_failed
            else "💔 There was an issue running the tests. 😭"
        )

    message_blocks = [{"type": "header", "text": {"type": "plain_text", "text": title}}]
    if ci_title:
        message_blocks.append({"type": "section", "text": {"type": "mrkdwn", "text": ci_title}})

    run_url = f"https://github.com/huggingface/transformers/actions/runs/{os.environ['GITHUB_RUN_ID']}"
    message_blocks += [
        {
            "type": "header",
            "text": {"type": "plain_text", "text": reason},
        },
        {
            "type": "section",
            "text": {"type": "plain_text", "text": "🙏 Let's fix it ASAP! 🙏"},
            "accessory": {
                "type": "button",
                "text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
                "url": run_url,
            },
        },
    ]

    # The blocks are handed to the client as a JSON string; serialize before logging.
    serialized_blocks = json.dumps(message_blocks)

    print("Sending the following payload")
    print(json.dumps({"blocks": message_blocks}))

    # `reason` is also passed as the plain `text` of the message.
    client.chat_postMessage(
        channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"],
        text=reason,
        blocks=serialized_blocks,
    )
414438

@@ -630,6 +654,11 @@ def prepare_reports(title, header, reports, to_truncate=True):
630654

631655
if __name__ == "__main__":
632656

657+
setup_status = os.environ.get("SETUP_STATUS")
658+
runner_status = os.environ.get("RUNNER_STATUS")
659+
setup_failed = True if setup_status is not None and setup_status != "success" else False
660+
runner_failed = True if runner_status is not None and runner_status != "success" else False
661+
633662
org = "huggingface"
634663
repo = "transformers"
635664
repository_full_name = f"{org}/{repo}"
@@ -689,6 +718,10 @@ def prepare_reports(title, header, reports, to_truncate=True):
689718
else:
690719
ci_title = ""
691720

721+
if setup_failed or runner_failed:
722+
Message.error_out(title, ci_title, setup_failed, runner_failed)
723+
exit(0)
724+
692725
arguments = sys.argv[1:][0]
693726
try:
694727
models = ast.literal_eval(arguments)

0 commit comments

Comments
 (0)