Add checks for some workflow jobs (#18583)

ydshieh · web-flow · commit 81ab11124f24 · 2022-08-16T13:53:47.000+02:00
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml
@@ -111,9 +111,24 @@ jobs:
           echo "::set-output name=matrix::$keys"
           echo "::set-output name=test_map::$test_map"
 
+  run_check_runners:  # runner check job; the GPU test jobs in this workflow list it in `needs`
+    name: Check Runners
+    needs: setup
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]  # one check per runner label
+    runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
+    container:
+      image: huggingface/transformers-all-latest-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: NVIDIA-SMI  # runs `nvidia-smi` inside the container on each runner type
+        run: |
+          nvidia-smi
+
   run_tests_single_gpu:
     name: Model tests
-    needs: setup
+    needs: [setup, run_check_runners]
     # `dummy` means there is no test to run
     if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
     strategy:
@@ -198,7 +213,7 @@ jobs:
 
   run_tests_multi_gpu:
     name: Model tests
-    needs: setup
+    needs: [setup, run_check_runners]
     # `dummy` means there is no test to run
     if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
     strategy:
@@ -285,7 +300,7 @@ jobs:
 
   run_tests_torch_cuda_extensions_single_gpu:
     name: Torch CUDA extension tests
-    needs: setup
+    needs: [setup, run_check_runners]
     if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
     strategy:
       fail-fast: false
@@ -364,7 +379,7 @@ jobs:
 
   run_tests_torch_cuda_extensions_multi_gpu:
     name: Torch CUDA extension tests
-    needs: setup
+    needs: [setup, run_check_runners]
     if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
     strategy:
       fail-fast: false
@@ -447,12 +462,20 @@ jobs:
     if: always()
     needs: [
         setup,
+        run_check_runners,
         run_tests_single_gpu,
         run_tests_multi_gpu,
         run_tests_torch_cuda_extensions_single_gpu,
         run_tests_torch_cuda_extensions_multi_gpu
     ]
     steps:
+      - name: Preliminary job status
+        shell: bash
+        # Print the results of the prerequisite jobs `setup` and `run_check_runners`
+        run: |
+          echo "Setup status: ${{ needs.setup.result }}"
+          echo "Runner status: ${{ needs.run_check_runners.result }}"
+
       # Necessary to get the correct branch name and commit SHA for `workflow_run` event
       # We also take into account the `push` event (we might want to test some changes in a branch)
       - name: Prepare custom environment variables
@@ -498,6 +521,9 @@ jobs:
           CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
           CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
           CI_SHA: ${{ env.CI_SHA }}
+          SETUP_STATUS: ${{ needs.setup.result }}
+          RUNNER_STATUS: ${{ needs.run_check_runners.result }}
+
         # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
         # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
         run: |
diff --git a/utils/notification_service.py b/utils/notification_service.py
@@ -387,28 +387,52 @@ def payload(self) -> str:
         return json.dumps(blocks)
 
     @staticmethod
-    def error_out():
-        payload = [
-            {
-                "type": "section",
-                "text": {
-                    "type": "plain_text",
-                    "text": "There was an issue running the tests.",
-                },
-                "accessory": {
-                    "type": "button",
-                    "text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
-                    "url": f"https://github.com/huggingface/transformers/actions/runs/{os.environ['GITHUB_RUN_ID']}",
-                },
-            }
-        ]
+    def error_out(title, ci_title="", setup_failed=False, runner_failed=False):
+        """Post a failure notice (title, optional CI title, failure reason) to the Slack report channel."""
+        blocks = []
+        title_block = {"type": "header", "text": {"type": "plain_text", "text": title}}
+        blocks.append(title_block)
+
+        if ci_title:  # the CI title section is only added when a title is given
+            ci_title_block = {"type": "section", "text": {"type": "mrkdwn", "text": ci_title}}
+            blocks.append(ci_title_block)
+
+        if setup_failed:  # a setup failure takes precedence over a runner failure
+            text = "💔 Setup job failed. Tests are not run. 😭"
+        elif runner_failed:
+            text = "💔 CI runners have problems! Tests are not run. 😭"
+        else:
+            text = "💔 There was an issue running the tests. 😭"
+
+        error_block_1 = {
+            "type": "header",
+            "text": {
+                "type": "plain_text",
+                "text": text,
+            },
+        }
+        error_block_2 = {
+            "type": "section",
+            "text": {
+                "type": "plain_text",
+                "text": "🙏 Let's fix it ASAP! 🙏",
+            },
+            "accessory": {
+                "type": "button",
+                "text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
+                "url": f"https://github.com/huggingface/transformers/actions/runs/{os.environ['GITHUB_RUN_ID']}",
+            },
+        }
+        blocks.extend([error_block_1, error_block_2])
+
+        payload = json.dumps(blocks)  # JSON string handed to the client as `blocks` below
 
         print("Sending the following payload")
-        print(json.dumps({"blocks": json.loads(payload)}))
+        print(json.dumps({"blocks": blocks}))
 
         client.chat_postMessage(
             channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"],
-            text="There was an issue running the tests.",
+            text=text,
             blocks=payload,
         )
 
@@ -630,6 +654,11 @@ def prepare_reports(title, header, reports, to_truncate=True):
 
 if __name__ == "__main__":
 
+    setup_status = os.environ.get("SETUP_STATUS")  # job results exported by the workflow; may be unset
+    runner_status = os.environ.get("RUNNER_STATUS")
+    setup_failed = setup_status not in (None, "success")  # an unset variable does not count as a failure
+    runner_failed = runner_status not in (None, "success")
+
     org = "huggingface"
     repo = "transformers"
     repository_full_name = f"{org}/{repo}"
@@ -689,6 +718,10 @@ def prepare_reports(title, header, reports, to_truncate=True):
     else:
         ci_title = ""
 
+    if setup_failed or runner_failed:  # no test reports to process: send the failure notice and stop
+        Message.error_out(title, ci_title, setup_failed, runner_failed)
+        sys.exit(0)  # `sys.exit` rather than the `site`-provided `exit`, which is not guaranteed to exist
+
     arguments = sys.argv[1:][0]
     try:
         models = ast.literal_eval(arguments)