@@ -500,9 +500,9 @@ def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: l
     return 112


-def _run_single_profile(test: TestCase) -> str:
+def _run_single_profile_torch(test: TestCase) -> str:
     """
-    Runs a single test case. Do not call directly
+    Profiles a single benchmark using the torch profiler.
     """
     from submission import custom_kernel
     from torch.profiler import profile, ProfilerActivity
@@ -511,14 +511,42 @@ def _run_single_profile(test: TestCase) -> str:
     data = generate_input(**test.args)
     torch.cuda.synchronize()

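+    # Clone the inputs up front so the copy happens outside the profiled region.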
+    cloned = _clone_data(data, 0)
     with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
         with nvtx_range("custom_kernel"):
-            submission_output = custom_kernel(_clone_data(data, 0))
+            submission_output = custom_kernel(cloned)
             torch.cuda.synchronize()

     return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)


+def _run_single_profile_ncu(test: TestCase) -> str:
+    """
+    Profiles a single benchmark using NCU. Note: this does not
+    invoke NCU itself; instead, eval is expected to be launched
+    under NCU, and this function will run the kernel exactly
+    once in the 'custom_kernel' nvtx range.
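+
+    Example launch (the entry-point name and ncu flags here are
+    assumptions, not taken from this repo):
+        POPCORN_NCU=1 ncu --nvtx --nvtx-include "custom_kernel/" python eval.py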
529+ """
+    from submission import custom_kernel
+
+    with nvtx_range("generate input"):
+        data = generate_input(**test.args)
+        torch.cuda.synchronize()
+
+    cloned = _clone_data(data, 0)
+    with nvtx_range("custom_kernel"):
+        submission_output = custom_kernel(cloned)
+        torch.cuda.synchronize()
+
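+    # NCU, not this process, collects the profile, so there is no report to return.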
+    return ""
+
+
 def _run_distributed_profile(test: TestCase, rank: int) -> "EventList":
     """
     Runs a single profiling case. Do not call directly
@@ -610,7 +638,11 @@ def run_single_profile(test: TestCase, pool: multiprocessing.Pool) -> str:
610632 """
611633 world_size = test .args .get ("world_size" , None )
612634 if world_size is None :
613- return pool .apply (_run_single_profile , (test ,))
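+        # POPCORN_NCU opts in to the NCU path; the default "0" keeps the torch profiler.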
+        if os.getenv("POPCORN_NCU", "0") != "0":
+            return pool.apply(_run_single_profile_ncu, (test,))
+        else:
+            return pool.apply(_run_single_profile_torch, (test,))
     else:
         return run_multi_gpu_profile(pool, test, world_size)
