diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml index 13841d27..a619f8e1 100644 --- a/.github/workflows/nvidia-arc-health.yml +++ b/.github/workflows/nvidia-arc-health.yml @@ -6,27 +6,15 @@ on: - cron: '0 2 * * *' workflow_dispatch: push: - branches: [main] jobs: health-check: - runs-on: [gpumode-nvidia-arc] + runs-on: [nvidia-docker-b200-8-x86-64] timeout-minutes: 5 - container: - image: nvidia/cuda:12.4.0-devel-ubuntu22.04 steps: - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - - name: Install PyTorch - run: | - pip install torch - - name: GPU Health Check - run: python -c "import torch; torch.randn(5, device='cuda')" + run: python3 -c "import torch; torch.randn(5, device='cuda')" env: CUDA_VISIBLE_DEVICES: 0 diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index b50ec044..6f455fbe 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -19,22 +19,15 @@ run-name: 'NVIDIA Job - ${{ github.event.inputs.run_id }}' jobs: run: - runs-on: [gpumode-nvidia-arc] + runs-on: [nvidia-docker-b200-8-x86-64] timeout-minutes: 10 - container: - image: nvidia/cuda:12.4.0-devel-ubuntu22.04 steps: - uses: actions/checkout@v3 - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - - name: Install uv - uses: astral-sh/setup-uv@v3 - with: - version: "latest" + - name: nvidia-smi + shell: bash + run: | + nvidia-smi || echo "nvidia-smi failed" - name: Create input files shell: bash @@ -49,30 +42,18 @@ jobs: # Now write to file (won't be logged since it's masked) echo "$PAYLOAD" > payload.json - - name: Install uv - uses: astral-sh/setup-uv@v3 - with: - version: "latest" - - - name: Setup Python environment + - name: Setup Virtual Environment and Install Dependencies shell: bash run: | - uv venv .venv - echo "VIRTUAL_ENV=$PWD/.venv" >> $GITHUB_ENV - echo "$PWD/.venv/bin" >> $GITHUB_PATH + pip install --upgrade pip + pip install -r "requirements.txt" + pip install -e . - if [[ -n "${{ github.event.inputs.requirements }}" ]]; then - cat > "requirements.txt" <<'EOL' - ${{ github.event.inputs.requirements }} - EOL - uv pip install -r "requirements.txt" - fi - uv pip install -e . - name: Run script shell: bash run: | - python src/runners/github-runner.py + python3 src/runners/github-runner.py - name: Upload training artifacts uses: actions/upload-artifact@v4 @@ -88,5 +69,3 @@ jobs: name: profile-data path: profile_data/* retention-days: 1 - env: - CUDA_VISIBLE_DEVICES: 0 diff --git a/examples/eval.py b/examples/eval.py index 597b5ff4..187e11cd 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -500,9 +500,9 @@ def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: l return 112 -def _run_single_profile(test: TestCase) -> str: +def _run_single_profile_torch(test: TestCase) -> str: """ - Runs a single test case. Do not call directly + Profiles a single benchmark using the torch profiler. 
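+    Returns the profiler's key_averages() table (top entries by self CUDA
+    time) as a string.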
""" from submission import custom_kernel from torch.profiler import profile, ProfilerActivity @@ -511,14 +511,36 @@ def _run_single_profile(test: TestCase) -> str: data = generate_input(**test.args) torch.cuda.synchronize() + cloned = _clone_data(data, 0) with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: with nvtx_range("custom_kernel"): - submission_output = custom_kernel(_clone_data(data, 0)) + submission_output = custom_kernel(cloned) torch.cuda.synchronize() return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20) +def _run_single_profile_ncu(test: TestCase) -> str: + """ + Profiles a single benchmark using ncu. Note: this does not + invoke NCU; instead, it is expected that eval is launched + under NCU, and this function will rurnthe kernel excactly + once in the 'custom_kernel' nvtx range. + """ + from submission import custom_kernel + + with nvtx_range("generate input"): + data = generate_input(**test.args) + torch.cuda.synchronize() + + cloned = _clone_data(data, 0) + with nvtx_range("custom_kernel"): + submission_output = custom_kernel(cloned) + torch.cuda.synchronize() + + return "" + + def _run_distributed_profile(test: TestCase, rank: int) -> "EventList": """ Runs a single profiling case. Do not call directly @@ -610,7 +632,10 @@ def run_single_profile(test: TestCase, pool: multiprocessing.Pool) -> str: """ world_size = test.args.get("world_size", None) if world_size is None: - return pool.apply(_run_single_profile, (test,)) + if bool(os.getenv("POPCORN_NCU", "0")): + return pool.apply(_run_single_profile_ncu, (test,)) + else: + return pool.apply(_run_single_profile_torch, (test,)) else: return run_multi_gpu_profile(pool, test, world_size) diff --git a/scripts/ci_test_cuda.py b/scripts/ci_test_cuda.py index c3fa893c..de1f5fbe 100644 --- a/scripts/ci_test_cuda.py +++ b/scripts/ci_test_cuda.py @@ -19,12 +19,12 @@ def run_cuda_helper(sources: dict, headers: dict = None, arch=None, **kwargs): headers = header_files eval_result = run_cuda_script( - make_system_info(), sources, headers, arch=arch, mode=SubmissionMode.TEST.value, tests="size: 256; seed: 42\n", + system=make_system_info(), **kwargs, ) return eval_result.compilation, eval_result.run @@ -195,12 +195,12 @@ def test_include_dirs(tmp_path: Path): # can also use generic flags argument result = run_cuda_script( - make_system_info(), {"eval.cu": eval_cu, "submission.cu": sub}, header_files, flags=["-I.", f"-I{tmp_path}"], mode=SubmissionMode.TEST.value, tests="size: 256; seed: 42\n", + system=make_system_info(), ) assert result.compilation.success is True diff --git a/scripts/ci_test_python.py b/scripts/ci_test_python.py index 7cc4fedd..1bd8dd9f 100644 --- a/scripts/ci_test_python.py +++ b/scripts/ci_test_python.py @@ -12,11 +12,11 @@ def run_pytorch_helper(sources: dict, tests=None, **kwargs): result = run_pytorch_script( - make_system_info(), sources, "eval.py", mode=SubmissionMode.TEST.value, tests=tests or "size: 256; seed: 42\n", + system=make_system_info(), **kwargs, ) return result.run @@ -45,7 +45,7 @@ def custom_kernel(input): run = run_pytorch_helper({**files, "submission.py": sub}) assert run.success is True assert run.passed is False - assert "python eval.py test" in run.command + assert "python3 eval.py test" in run.command assert run.stdout == "" assert run.stderr == "" diff --git a/src/kernelbot/discord_reporter.py b/src/kernelbot/discord_reporter.py index 3b6fd8c3..d0551b07 100644 --- a/src/kernelbot/discord_reporter.py +++ b/src/kernelbot/discord_reporter.py 
@@ -1,7 +1,8 @@
 import discord
 
-from discord_utils import _send_split_log
+from discord_utils import _send_file, _send_split_log
 from libkernelbot.report import (
+    File,
     Link,
     Log,
     MultiProgressReporter,
@@ -70,6 +71,11 @@ async def display_report(self, title: str, report: RunResultReport):
                 message += part.text
             elif isinstance(part, Log):
                 message = await _send_split_log(thread, message, part.header, part.content)
+            elif isinstance(part, File):
+                if len(message) > 0:
+                    await thread.send(message)
+                await _send_file(thread, part.message, part.name, part.content)
+                message = ""
             elif isinstance(part, Link):
                 if len(message) > 0:
                     await thread.send(message)
diff --git a/src/kernelbot/discord_utils.py b/src/kernelbot/discord_utils.py
index d014f3ca..7924a3d2 100644
--- a/src/kernelbot/discord_utils.py
+++ b/src/kernelbot/discord_utils.py
@@ -1,5 +1,6 @@
 import functools
 import logging
+from io import BytesIO
 
 import discord
 
@@ -124,7 +125,7 @@ async def _send_split_log(thread: discord.Thread, partial_message: str, header:
         else:
             if partial_message != "":
                 chunks.append(partial_message)
-            partial_message = line
+            partial_message = line + "\n"
 
     if partial_message != "":
         chunks.append(partial_message)
@@ -133,6 +134,10 @@ async def _send_split_log(thread: discord.Thread, partial_message: str, header:
     for i, chunk in enumerate(chunks):
         partial_message = f"\n\n## {header} ({i+1}/{len(chunks)}):\n"
         partial_message += f"```\n{limit_length(chunk, 1900)}```"
-        await thread.send(partial_message)
+        await thread.send(partial_message, silent=True)
 
     return ""
+
+
+async def _send_file(thread: discord.Thread, message: str, name: str, file: bytes):
+    await thread.send(message, file=discord.File(BytesIO(file), filename=name), silent=True)
diff --git a/src/libkernelbot/launchers/github.py b/src/libkernelbot/launchers/github.py
index 4c1b1d5f..3f09b94d 100644
--- a/src/libkernelbot/launchers/github.py
+++ b/src/libkernelbot/launchers/github.py
@@ -143,7 +143,7 @@ async def run_submission(  # noqa: C901
         # Update profile artifact to the actual download URL.
         # For the GitHub launcher the profile_artifact currently just contains
         # the name of the artifact.
-        if profile_res is not None:
+        if profile_res is not None and "profile-data" in index:
             profile_res.download_url = index["profile-data"].public_download_url
 
     res = EvalResult(
diff --git a/src/libkernelbot/report.py b/src/libkernelbot/report.py
index 25bb27cb..58beaffe 100644
--- a/src/libkernelbot/report.py
+++ b/src/libkernelbot/report.py
@@ -43,9 +43,19 @@ class Link:
     url: str
 
 
+@dataclasses.dataclass
+class File:
+    """
+    File represents a file that gets attached to the report.
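+    `name` is the attachment's file name, `message` is a short caption shown
+    alongside it, and `content` holds the raw bytes to upload.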
+ """ + name: str + message: str + content: bytes + + class RunResultReport: def __init__(self, data=None): - self.data: List[Text | Log | Link] = data or [] + self.data: List[Text | Log | Link | File] = data or [] def add_text(self, section: str): self.data.append(Text(section)) @@ -56,6 +66,9 @@ def add_log(self, header: str, log: str): def add_link(self, title: str, text: str, url: str): self.data.append(Link(title, text, url)) + def add_file(self, name: str, message: str, content: bytes): + self.data.append(File(name, message, content)) + def __repr__(self): return f"RunResultReport(data={self.data})" @@ -174,16 +187,18 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n elif full: result.append("❌ Benchmarks missing") - if "profile" in runs: - bench_run = runs["profile"].run - if not bench_run.success: - result.append("❌ Running profile failed" + _short_fail_reason(bench_run)) - return result - elif not bench_run.passed: - result.append("❌ Profiling failed") - return result - else: - result.append("✅ Profiling successful") + profile_runs = [v for k, v in runs.items() if k.startswith("profile")] + if len(profile_runs) > 0: + for prof_run in profile_runs: + bench_run = prof_run.run + if not bench_run.success: + result.append("❌ Running profile failed" + _short_fail_reason(bench_run)) + return result + elif not bench_run.passed: + result.append("❌ Profiling failed") + return result + else: + result.append("✅ Profiling successful") if "leaderboard" in runs: lb_run = runs["leaderboard"].run @@ -257,12 +272,9 @@ def make_profile_log(run: RunResult) -> str: num_bench = int(run.result.get("benchmark-count", 0)) def log_one(base_name): - spec = run.result.get(f"{base_name}.spec") - report: str = run.result.get(f"{base_name}.report") report = base64.b64decode(report.encode("utf-8"), b"+*").decode("utf-8") report = textwrap.indent(report, " ") - bench_log.append(f"{spec}\n") bench_log.append(report) bench_log = [] @@ -299,6 +311,10 @@ def _handle_crash_report(report: RunResultReport, run_result: EvalResult): return False +def _shortname(spec: str): + return spec.replace(": ", "=").replace("; ", "_") + + def generate_report(result: FullResult) -> RunResultReport: # noqa: C901 runs = result.runs report = RunResultReport() @@ -327,22 +343,33 @@ def generate_report(result: FullResult) -> RunResultReport: # noqa: C901 make_benchmark_log(bench_run.run), ) - if "profile" in runs: - prof_run = runs["profile"] - if _handle_crash_report(report, prof_run): - return report - - report.add_log( - "Profiling", - make_profile_log(prof_run.run), - ) - - if prof_run.profile is not None and prof_run.profile.download_url is not None: - report.add_link( - f"{prof_run.profile.profiler} profiling output", - "Download from GitHub", - prof_run.profile.download_url, - ) + profile_runs = [v for k, v in runs.items() if k.startswith("profile")] + if len(profile_runs) > 0: + for prof_run in profile_runs: + if _handle_crash_report(report, prof_run): + return report + + if prof_run.profile.trace is not None: + report.add_log( + f"Profiling {prof_run.run.result.get('benchmark.0.spec')}", + make_profile_log(prof_run.run), + ) + + if prof_run.profile.download_url is not None: + report.add_link( + f"{prof_run.profile.profiler} profiling output", + "Download from GitHub", + prof_run.profile.download_url, + ) + + for prof_run in profile_runs: + if prof_run.profile is not None: + if prof_run.profile.trace is not None: + report.add_file( + 
f"profile-{_shortname(prof_run.run.result.get('benchmark.0.spec'))}.zip", + f"{prof_run.profile.profiler} report - " + prof_run.run.result.get("benchmark.0.spec"), + base64.b64decode(prof_run.profile.trace), + ) if "leaderboard" in runs: bench_run = runs["leaderboard"] diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index c0897baf..e3879ee7 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -1,3 +1,5 @@ +import base64 +import copy import dataclasses import datetime import functools @@ -19,10 +21,13 @@ class ProfileResult: # fmt: off profiler: str # The profiler used to gather this data + # Profiler trace. May be empty, in which case `download_url` + # should point to the trace file. + trace: str # Public download URL of all files created by the profiler # This may also be configured later download_url: Optional[str] - #fmt: on + # fmt: on @dataclasses.dataclass @@ -122,6 +127,47 @@ def _create_files(files: Optional[dict[str, str]]): Path(name).write_text(content) +def _directory_to_zip_bytes(directory_path) -> str: + """Create a zip archive and return as base64 encoded bytes.""" + with tempfile.TemporaryDirectory() as temp_dir: + archive_path = os.path.join(temp_dir, 'archive') + shutil.make_archive(archive_path, 'zip', directory_path) + + with open(archive_path + '.zip', 'rb') as f: + data = f.read() + + return base64.b64encode(data).decode('utf-8') + + +def _filter_ncu_report(report: str, tables: list): + """ + Extract the Speed-of-light section from the full ncu terminal report. + + For expert users, we just attach the full ncu profile to the result, + and they can view whichever metrics they are interested in. But to + encourage novice users to try out profiling, we want to have a + *simple* set of things to display automatically, short enough to fit + in a *single* discord message. + """ + result = "" + collect = False + for line in report.splitlines(): + if "Table Name : " in line: + table = line[line.find("Table Name :") + len("Table Name :"):].strip() + if table in tables: + result += "\n" + collect = True + else: + collect = False + + if len(line.strip()) == 0: + collect = False + + if collect: + result += line + "\n" + return result + + def compile_cuda_script( # # noqa: C901 files: list[str], arch: Optional[int] = None, @@ -305,6 +351,122 @@ def run_program( ) +def profile_program_roc( + call: list[str], + seed: Optional[int], + timeout: int, + multi_gpu: bool, + output_dir: Path, +) -> tuple[RunResult, Optional[ProfileResult]]: + # Wrap program in rocprof + call = [ + "rocprofv3", + "--log-level", + "fatal", + "--hip-trace", + "--kernel-trace", + "--rccl-trace", + "--marker-trace", + "--hip-trace", + "--memory-copy-trace", + # New? Doesn't work in the runner + # "--memory-allocation-trace", + "--scratch-memory-trace", + # The HSA trace output is very large, so skip it for now + # "--hsa-trace", + "--output-format", + "pftrace", + "csv", + "-d", + str(output_dir), + # Just store the files as %pid%_tracename.ext instead of putting them in an + # additional directory named after the hostname. + "-o", + # Insert an extra path here so that the resulting zip has all files + # in the profile_data/ directory rather than the root. + "%pid%", + "--", + ] + call + + run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={ + "GPU_DUMP_CODE_OBJECT": "1", + }, + ) + + profile_result = None + + if run_result.success: + # Post-process trace data. 
+        # rocPROF generates one trace for every process, but it's more useful to
+        # have all traces be in the same file. Fortunately we can do that by
+        # concatenating.
+        traces = list(output_dir.glob("*.pftrace"))
+        with (output_dir / "combined.pftrace").open("wb") as combined:
+            for trace_path in traces:
+                with trace_path.open("rb") as trace:
+                    shutil.copyfileobj(trace, combined)
+
+                # After we've created the combined trace, there is no point in
+                # keeping the individual traces around.
+                trace_path.unlink()
+
+        # Also move the code objects to the profiling output directory.
+        for code_obj in list(Path.cwd().glob("_code_object*.o")):
+            code_obj.rename(output_dir / code_obj.name)
+
+        profile_result = ProfileResult(
+            profiler="rocPROF",
+            trace=_directory_to_zip_bytes(output_dir),
+            download_url=None,
+        )
+
+    return run_result, profile_result
+
+
+def profile_program_ncu(
+    call: list[str],
+    seed: Optional[int],
+    timeout: int,
+    multi_gpu: bool,
+    output_dir: Path,
+) -> tuple[RunResult, Optional[ProfileResult]]:
+    assert not multi_gpu, "Multi-GPU profiling not supported for ncu."
+
+    # Wrap program in ncu
+    call = [
+        "ncu",
+        "--set", "full",
+        "--nvtx",
+        "--nvtx-include", "custom_kernel/",
+        "--import-source", "1",
+        "-o", f"{str(output_dir / 'profile.ncu-rep')}",
+        "--",
+    ] + call
+
+    run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={
+        "POPCORN_NCU": "1"
+    })
+    profile_result = None
+
+    try:
+        get_tables = ["GPU Throughput", "Pipe Utilization (% of active cycles)", "Warp State (All Cycles)"]
+        ncu_cmd = ["ncu", "--import", f"{str(output_dir / 'profile.ncu-rep')}", "--print-details", "body"]
+        report = subprocess.check_output(ncu_cmd, text=True)
+        report = _filter_ncu_report(report, get_tables)
+        run_result.result["benchmark.0.report"] = base64.b64encode(report.encode("utf-8")).decode("utf-8")
+    except subprocess.CalledProcessError:
+        pass
+
+    if run_result.success:
+        profile_result = ProfileResult(
+            profiler='Nsight-Compute',
+            trace=_directory_to_zip_bytes(output_dir),
+            download_url=None,
+        )
+
+    return run_result, profile_result
+
+
 def profile_program(
     system: SystemInfo,
     call: list[str],
@@ -315,82 +477,25 @@
     # The runner-specific configuration should implement logic
     # to fetch the data in this directory and return it as
     # ProfileResult.download_url.
-    # Insert an extra nested nested path here so that the resulting zip has all files
+    # Insert an extra nested path here so that the resulting zip has all files
     # in the profile_data/ directory rather than directly in the root.
-    output_dir = Path(".") / "profile_data" / "profile_data"
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-    if system.runtime == "ROCm":
-        # Wrap program in rocprof
-        call = [
-            "rocprofv3",
-            "--log-level",
-            "fatal",
-            "--hip-trace",
-            "--kernel-trace",
-            "--rccl-trace",
-            "--marker-trace",
-            "--hip-trace",
-            "--memory-copy-trace",
-            # New? Doesn't work in the runner
-            # "--memory-allocation-trace",
-            "--scratch-memory-trace",
-            # The HSA trace output is very large, so skip it for now
-            # "--hsa-trace",
-            "--output-format",
-            "pftrace",
-            "csv",
-            "-d",
-            str(output_dir),
-            # Just store the files as %pid%_tracename.ext instead of putting them in an
-            # additional directory named after the hostname.
-            "-o",
-            # Insert an extra path here so that the resulting zip has all files
-            # in the profile_data/ directory rather than the root.
- "%pid%", - "--", - ] + call - - run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={ - "GPU_DUMP_CODE_OBJECT": "1", - }) - - profile_result = None - - if run_result.success: - # Post-process trace data. - # rocPROF generates one trace for every process, but its more useful to - # have all traces be in the same file. Fortunately we can do that by - # concatenating. - traces = list(output_dir.glob("*.pftrace")) - with (output_dir / "combined.pftrace").open("wb") as combined: - for trace_path in traces: - with trace_path.open("rb") as trace: - shutil.copyfileobj(trace, combined) - - # After we've created the combined trace, there is no point in - # keeping the individual traces around. - trace_path.unlink() - - # Also move the code objects to the profiling output directory. - for code_obj in list(Path.cwd().glob("_code_object*.o")): - code_obj.rename(output_dir / code_obj.name) - - profile_result = ProfileResult( - profiler='rocPROF', - download_url=None, - ) - return run_result, profile_result - else: - # TODO: Implement profiling for other platforms - return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None + with tempfile.TemporaryDirectory(dir=".") as tmpdir: + output_dir = Path(tmpdir) / "profile_data" + output_dir.mkdir() + if system.runtime == "ROCm": + return profile_program_roc(call, seed, timeout, multi_gpu, output_dir) + elif system.runtime == "CUDA": + return profile_program_ncu(call, seed, timeout, multi_gpu, output_dir) + else: + raise ValueError(f"Unknown runtime {system.runtime}") + def run_single_evaluation( - system: SystemInfo, call: list[str], mode: str, *, + system: SystemInfo, multi_gpu: bool = False, tests: Optional[str] = None, benchmarks: Optional[str] = None, @@ -419,7 +524,7 @@ def run_single_evaluation( cases.flush() - call += [mode, cases.name] + call = call + [mode, cases.name] if mode == "profile": return profile_program(system, call, seed=seed, timeout=timeout, multi_gpu=multi_gpu) @@ -427,7 +532,7 @@ def run_single_evaluation( return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None -def make_system_info() -> SystemInfo: # noqa: C901 +def make_system_info() -> SystemInfo: # noqa: C901 info = SystemInfo() try: import torch @@ -448,14 +553,16 @@ def make_system_info() -> SystemInfo: # noqa: C901 info.gpu = subprocess.check_output( ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], encoding="utf-8" ) - info.device_count = info.gpu.count('\n') + info.device_count = info.gpu.count("\n") info.runtime = "CUDA" except subprocess.CalledProcessError: # try again for HIP try: - rocm_info = json.loads(subprocess.check_output( - ["rocm-smi", "--showproductname", "--json"], encoding="utf-8" - )) + rocm_info = json.loads( + subprocess.check_output( + ["rocm-smi", "--showproductname", "--json"], encoding="utf-8" + ) + ) if len(rocm_info) > 0: info.gpu = next(rocm_info.__iter__())["Card Series"] @@ -489,7 +596,6 @@ def make_system_info() -> SystemInfo: # noqa: C901 def run_cuda_script( # # noqa: C901 - system: SystemInfo, sources: dict[str, str], headers: Optional[dict[str, str]] = None, arch: Optional[int] = None, @@ -550,7 +656,7 @@ def run_cuda_script( # # noqa: C901 if os.path.exists(f): os.remove(f) - run_result, profile_result = run_single_evaluation(system, ["./eval.out"], **kwargs) + run_result, profile_result = run_single_evaluation(["./eval.out"], **kwargs) return EvalResult( start=start, end=datetime.datetime.now(), @@ -561,7 +667,6 @@ def run_cuda_script( # # noqa: C901 
def run_pytorch_script( # noqa: C901 - system: SystemInfo, sources: dict[str, str], main: str, **kwargs, @@ -587,7 +692,7 @@ def run_pytorch_script( # noqa: C901 # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. try: - compile_run = run_program(["python", "submission.py"], seed=1, timeout=Timeout.COMPILE) + compile_run = run_program(["python3", "submission.py"], seed=1, timeout=Timeout.COMPILE) if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: comp = CompileResult( nvcc_found=True, @@ -613,7 +718,7 @@ def run_pytorch_script( # noqa: C901 exit_code=e.returncode, ) - run, profile = run_single_evaluation(system, ["python", main], **kwargs) + run, profile = run_single_evaluation(["python3", main], **kwargs) return EvalResult( start=start, @@ -629,12 +734,13 @@ def run_pytorch_script( # noqa: C901 class _EvalRunner(Protocol): - def __call__(self, mode: str) -> EvalResult: ... + def __call__(self, mode: str, **kwargs) -> EvalResult: ... def run_evaluation( call: _EvalRunner, mode: str, + common_args: dict, ) -> dict[str, EvalResult]: """ Given a "runner" function `call`, interprets the mode @@ -644,22 +750,28 @@ def run_evaluation( require multiple runner calls. """ results: dict[str, EvalResult] = {} - if mode in ["test", "benchmark", "profile"]: - results[mode] = call(mode=mode) + if mode == "profile": + benchmarks = copy.deepcopy(common_args["benchmarks"]) + for i, benchmark in enumerate(benchmarks.splitlines()): + common_args["benchmarks"] = benchmark + results[f"{mode}.{i}"] = call(mode=mode, **common_args) + + elif mode in ["test", "benchmark"]: + results[mode] = call(mode=mode, **common_args) elif mode in ["private", "leaderboard"]: # first, run the tests - results["test"] = call(mode="test") + results["test"] = call(mode="test", **common_args) if not results["test"].run or not results["test"].run.passed: return results - results["benchmark"] = call(mode="benchmark") + results["benchmark"] = call(mode="benchmark", **common_args) if not results["benchmark"].run or not results["benchmark"].run.passed: return results # if they pass, run the leaderboard validation - results["leaderboard"] = call(mode="leaderboard") + results["leaderboard"] = call(mode="leaderboard", **common_args) else: raise AssertionError("Invalid mode") @@ -693,8 +805,7 @@ def run_config(config: dict): runner = functools.partial( run_pytorch_script, sources=config["sources"], - main=config["main"], - **common_args, + main=config["main"] ) elif config["lang"] == "cu": runner = functools.partial( @@ -706,10 +817,9 @@ def run_config(config: dict): include_dirs=config.get("include_dirs", []), libraries=config.get("libraries", []), flags=CUDA_FLAGS, - **common_args, ) else: raise ValueError(f"Invalid language {config['lang']}") - results = run_evaluation(runner, config["mode"]) + results = run_evaluation(runner, config["mode"], common_args) return FullResult(success=True, error="", runs=results, system=system) diff --git a/src/runners/modal_runner.py b/src/runners/modal_runner.py index 6a048e62..d2cb0d64 100644 --- a/src/runners/modal_runner.py +++ b/src/runners/modal_runner.py @@ -16,7 +16,7 @@ # Move this to another file later: cuda_image = ( - Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.12") + Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.13") .apt_install( "git", "gcc-13", diff --git a/tests/test_report.py b/tests/test_report.py index a1964e62..9006a98e 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -6,6 +6,7 
@@ from libkernelbot import consts from libkernelbot.report import ( + File, RunResultReport, _generate_compile_report, _short_fail_reason, @@ -402,7 +403,6 @@ def test_make_profile_log(): log = make_profile_log(run) - assert "Matrix multiplication profile" in log assert " Profile line 1" in log assert " Profile line 2" in log @@ -664,6 +664,7 @@ def test_generate_report_profile(sample_full_result: FullResult): } sample_full_result.runs["profile"].profile = ProfileResult( profiler="NSight", + trace="", download_url="https://example.com", ) report = generate_report(sample_full_result) @@ -687,8 +688,11 @@ def test_generate_report_profile(sample_full_result: FullResult): "❌ Test division\n" "> Division by zero", ), - Log(header="Profiling", content="Benchmark\n\n Profile report\n"), + Log(header='Profiling Benchmark', content=' Profile report\n'), Link("NSight profiling output", "Download from GitHub", "https://example.com"), + File(name='profile-Benchmark.zip', + message='NSight report - Benchmark', + content=b''), ]