16 changes: 2 additions & 14 deletions .github/workflows/nvidia-arc-health.yml
@@ -6,27 +6,15 @@ on:
- cron: '0 2 * * *'
workflow_dispatch:
push:
branches: [main]

jobs:
health-check:
runs-on: [gpumode-nvidia-arc]
runs-on: [nvidia-docker-b200-8-x86-64]
timeout-minutes: 5
container:
image: nvidia/cuda:12.4.0-devel-ubuntu22.04

steps:
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.10'

- name: Install PyTorch
run: |
pip install torch

- name: GPU Health Check
run: python -c "import torch; torch.randn(5, device='cuda')"
run: python3 -c "import torch; torch.randn(5, device='cuda')"

env:
CUDA_VISIBLE_DEVICES: 0
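
The health check itself is just the one torch call above. For reference, a slightly more explicit local equivalent — a sketch that assumes only a CUDA-enabled PyTorch build, nothing else from this workflow — would be:

# Minimal GPU health probe, same spirit as the workflow's one-liner.
# Exits non-zero if no CUDA device is usable.
import sys

import torch

if not torch.cuda.is_available():
    sys.exit("CUDA is not available on this runner")

print(f"visible devices: {torch.cuda.device_count()}")
x = torch.randn(5, device="cuda")  # allocates on GPU 0 (CUDA_VISIBLE_DEVICES=0)
torch.cuda.synchronize()           # forces the kernel to actually execute
print("GPU health check passed:", float(x.sum()))
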
41 changes: 10 additions & 31 deletions .github/workflows/nvidia_workflow.yml
@@ -19,22 +19,15 @@ run-name: 'NVIDIA Job - ${{ github.event.inputs.run_id }}'

jobs:
run:
runs-on: [gpumode-nvidia-arc]
runs-on: [nvidia-docker-b200-8-x86-64]
timeout-minutes: 10
container:
image: nvidia/cuda:12.4.0-devel-ubuntu22.04
steps:
- uses: actions/checkout@v3

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.10'

- name: Install uv
uses: astral-sh/setup-uv@v3
with:
version: "latest"
- name: nvidia-smi
shell: bash
run: |
nvidia-smi || echo "nvidia-smi failed"

- name: Create input files
shell: bash
@@ -49,30 +42,18 @@ jobs:
# Now write to file (won't be logged since it's masked)
echo "$PAYLOAD" > payload.json

- name: Install uv
uses: astral-sh/setup-uv@v3
with:
version: "latest"

- name: Setup Python environment
- name: Setup Virtual Environment and Install Dependencies
shell: bash
run: |
uv venv .venv
echo "VIRTUAL_ENV=$PWD/.venv" >> $GITHUB_ENV
echo "$PWD/.venv/bin" >> $GITHUB_PATH
pip install --upgrade pip
pip install -r "requirements.txt"
pip install -e .

if [[ -n "${{ github.event.inputs.requirements }}" ]]; then
cat > "requirements.txt" <<'EOL'
${{ github.event.inputs.requirements }}
EOL
uv pip install -r "requirements.txt"
fi
uv pip install -e .

- name: Run script
shell: bash
run: |
python src/runners/github-runner.py
python3 src/runners/github-runner.py

- name: Upload training artifacts
uses: actions/upload-artifact@v4
@@ -88,5 +69,3 @@ jobs:
name: profile-data
path: profile_data/*
retention-days: 1
env:
CUDA_VISIBLE_DEVICES: 0
33 changes: 29 additions & 4 deletions examples/eval.py
@@ -500,9 +500,9 @@ def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: l
return 112


def _run_single_profile(test: TestCase) -> str:
def _run_single_profile_torch(test: TestCase) -> str:
"""
Runs a single test case. Do not call directly
Profiles a single benchmark using the torch profiler.
"""
from submission import custom_kernel
from torch.profiler import profile, ProfilerActivity
@@ -511,14 +511,36 @@ def _run_single_profile(test: TestCase) -> str:
data = generate_input(**test.args)
torch.cuda.synchronize()

cloned = _clone_data(data, 0)
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
with nvtx_range("custom_kernel"):
submission_output = custom_kernel(_clone_data(data, 0))
submission_output = custom_kernel(cloned)
torch.cuda.synchronize()

return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)


def _run_single_profile_ncu(test: TestCase) -> str:
"""
Profiles a single benchmark using ncu. Note: this does not
invoke NCU; instead, it is expected that eval is launched
under NCU, and this function will run the kernel exactly
once in the 'custom_kernel' nvtx range.
"""
from submission import custom_kernel

with nvtx_range("generate input"):
data = generate_input(**test.args)
torch.cuda.synchronize()

cloned = _clone_data(data, 0)
with nvtx_range("custom_kernel"):
submission_output = custom_kernel(cloned)
torch.cuda.synchronize()

return ""


def _run_distributed_profile(test: TestCase, rank: int) -> "EventList":
"""
Runs a single profiling case. Do not call directly
@@ -610,7 +632,10 @@ def run_single_profile(test: TestCase, pool: multiprocessing.Pool) -> str:
"""
world_size = test.args.get("world_size", None)
if world_size is None:
return pool.apply(_run_single_profile, (test,))
if os.getenv("POPCORN_NCU", "0") == "1":
return pool.apply(_run_single_profile_ncu, (test,))
else:
return pool.apply(_run_single_profile_torch, (test,))
else:
return run_multi_gpu_profile(pool, test, world_size)

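For context on the new NCU path: _run_single_profile_ncu only wraps the kernel in an NVTX range, so the whole eval process has to be launched under Nsight Compute for anything to be collected. A sketch of such a launch — the ncu flags, report path, and the "eval.py profile" invocation are illustrative assumptions, not taken from this PR:

# Hypothetical launch wrapper. Assumes ncu is on PATH, that eval.py has a
# "profile" mode, and that POPCORN_NCU=1 routes profiling to _run_single_profile_ncu.
import os
import subprocess

env = dict(os.environ, POPCORN_NCU="1")
subprocess.run(
    [
        "ncu",
        "--nvtx",                            # enable NVTX-based filtering
        "--nvtx-include", "custom_kernel/",  # profile only kernels inside that range
        "-o", "profile_data/ncu_report",     # write the .ncu-rep report here
        "python3", "eval.py", "profile",
    ],
    check=True,
    env=env,
)
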
4 changes: 2 additions & 2 deletions scripts/ci_test_cuda.py
@@ -19,12 +19,12 @@ def run_cuda_helper(sources: dict, headers: dict = None, arch=None, **kwargs):
headers = header_files

eval_result = run_cuda_script(
make_system_info(),
sources,
headers,
arch=arch,
mode=SubmissionMode.TEST.value,
tests="size: 256; seed: 42\n",
system=make_system_info(),
**kwargs,
)
return eval_result.compilation, eval_result.run
@@ -195,12 +195,12 @@ def test_include_dirs(tmp_path: Path):

# can also use generic flags argument
result = run_cuda_script(
make_system_info(),
{"eval.cu": eval_cu, "submission.cu": sub},
header_files,
flags=["-I.", f"-I{tmp_path}"],
mode=SubmissionMode.TEST.value,
tests="size: 256; seed: 42\n",
system=make_system_info(),
)

assert result.compilation.success is True
4 changes: 2 additions & 2 deletions scripts/ci_test_python.py
@@ -12,11 +12,11 @@

def run_pytorch_helper(sources: dict, tests=None, **kwargs):
result = run_pytorch_script(
make_system_info(),
sources,
"eval.py",
mode=SubmissionMode.TEST.value,
tests=tests or "size: 256; seed: 42\n",
system=make_system_info(),
**kwargs,
)
return result.run
@@ -45,7 +45,7 @@ def custom_kernel(input):
run = run_pytorch_helper({**files, "submission.py": sub})
assert run.success is True
assert run.passed is False
assert "python eval.py test" in run.command
assert "python3 eval.py test" in run.command
assert run.stdout == ""
assert run.stderr == ""

8 changes: 7 additions & 1 deletion src/kernelbot/discord_reporter.py
@@ -1,7 +1,8 @@
import discord
from discord_utils import _send_split_log
from discord_utils import _send_file, _send_split_log

from libkernelbot.report import (
File,
Link,
Log,
MultiProgressReporter,
@@ -70,6 +71,11 @@ async def display_report(self, title: str, report: RunResultReport):
message += part.text
elif isinstance(part, Log):
message = await _send_split_log(thread, message, part.header, part.content)
elif isinstance(part, File):
if len(message) > 0:
await thread.send(message)
await _send_file(thread, part.message, part.name, part.content)
message = ""
elif isinstance(part, Link):
if len(message) > 0:
await thread.send(message)
9 changes: 7 additions & 2 deletions src/kernelbot/discord_utils.py
@@ -1,5 +1,6 @@
import functools
import logging
from io import BytesIO

import discord

@@ -124,7 +125,7 @@ async def _send_split_log(thread: discord.Thread, partial_message: str, header:
else:
if partial_message != "":
chunks.append(partial_message)
partial_message = line
partial_message = line + "\n"

if partial_message != "":
chunks.append(partial_message)
@@ -133,6 +134,10 @@ async def _send_split_log(thread: discord.Thread, partial_message: str, header:
for i, chunk in enumerate(chunks):
partial_message = f"\n\n## {header} ({i+1}/{len(chunks)}):\n"
partial_message += f"```\n{limit_length(chunk, 1900)}```"
await thread.send(partial_message)
await thread.send(partial_message, silent=True)

return ""


async def _send_file(thread: discord.Thread, message: str, name: str, file: bytes):
await thread.send(message, file=discord.File(BytesIO(file), filename=name), silent=True)
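
The trailing newline added in _send_split_log is easy to miss: the loop rebuilds each chunk line by line, and without it the line that opens a new chunk gets glued to the next one. A simplified, self-contained sketch of that accumulation (ignoring Discord limits and the real helper's thread/header arguments):

# Simplified model of the chunk-splitting in _send_split_log; not the actual helper.
def split_log(content: str, max_len: int = 1900) -> list[str]:
    chunks: list[str] = []
    partial = ""
    for line in content.splitlines():
        if len(partial) + len(line) + 1 <= max_len:
            partial += line + "\n"
        else:
            if partial:
                chunks.append(partial)
            partial = line + "\n"  # keep the newline so this line stays on its own row
    if partial:
        chunks.append(partial)
    return chunks

# Two short lines followed by a long one end up in separate, readable chunks.
print(split_log("first\nsecond\n" + "x" * 2000, max_len=50))
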
2 changes: 1 addition & 1 deletion src/libkernelbot/launchers/github.py
@@ -143,7 +143,7 @@ async def run_submission( # noqa: C901
# Update profile artifact to the actual download URL.
# For the GitHub launcher the profile_artifact currently just contains
# the name of the artifact.
if profile_res is not None:
if profile_res is not None and "profile-data" in index:
profile_res.download_url = index["profile-data"].public_download_url

res = EvalResult(