
Commit 394e234

profile each benchmark individually for cleaner traces
1 parent d754094 commit 394e234

2 files changed: +43 -34 lines

src/libkernelbot/report.py

Lines changed: 27 additions & 24 deletions
@@ -174,16 +174,18 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n
     elif full:
         result.append("❌ Benchmarks missing")

-    if "profile" in runs:
-        bench_run = runs["profile"].run
-        if not bench_run.success:
-            result.append("❌ Running profile failed" + _short_fail_reason(bench_run))
-            return result
-        elif not bench_run.passed:
-            result.append("❌ Profiling failed")
-            return result
-        else:
-            result.append("✅ Profiling successful")
+    profile_runs = [v for k, v in runs.items() if k.startswith("profile")]
+    if len(profile_runs) > 0:
+        for prof_run in profile_runs:
+            bench_run = prof_run.run
+            if not bench_run.success:
+                result.append("❌ Running profile failed" + _short_fail_reason(bench_run))
+                return result
+            elif not bench_run.passed:
+                result.append("❌ Profiling failed")
+                return result
+            else:
+                result.append("✅ Profiling successful")

     if "leaderboard" in runs:
         lb_run = runs["leaderboard"].run
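
The short report no longer looks up a single "profile" entry: every run whose key starts with "profile" is collected and summarized in turn, so the per-benchmark runs introduced in run_eval.py each get their own status line. A minimal sketch of that prefix lookup, using SimpleNamespace stand-ins instead of the real EvalResult objects and made-up keys:

from types import SimpleNamespace

# Stand-ins for EvalResult; only the fields the short report inspects.
runs = {
    "test": SimpleNamespace(run=SimpleNamespace(success=True, passed=True)),
    "profile.0": SimpleNamespace(run=SimpleNamespace(success=True, passed=True)),
    "profile.1": SimpleNamespace(run=SimpleNamespace(success=True, passed=False)),
}

# Same prefix match as in the diff above: picks up every per-benchmark profiling run.
profile_runs = [v for k, v in runs.items() if k.startswith("profile")]
assert len(profile_runs) == 2

for prof_run in profile_runs:
    print("✅" if prof_run.run.passed else "❌", "profiling run")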
@@ -327,23 +329,24 @@ def generate_report(result: FullResult) -> RunResultReport: # noqa: C901
             make_benchmark_log(bench_run.run),
         )

-    if "profile" in runs:
-        prof_run = runs["profile"]
-        if _handle_crash_report(report, prof_run):
-            return report
+    profile_runs = [v for k, v in runs.items() if k.startswith("profile")]
+    if len(profile_runs) > 0:
+        for prof_run in profile_runs:
+            if _handle_crash_report(report, prof_run):
+                return report

-        report.add_log(
-            "Profiling",
-            make_profile_log(prof_run.run),
-        )
-
-        if prof_run.profile is not None and prof_run.profile.download_url is not None:
-            report.add_link(
-                f"{prof_run.profile.profiler} profiling output",
-                "Download from GitHub",
-                prof_run.profile.download_url,
+            report.add_log(
+                "Profiling",
+                make_profile_log(prof_run.run),
             )

+            if prof_run.profile is not None and prof_run.profile.download_url is not None:
+                report.add_link(
+                    f"{prof_run.profile.profiler} profiling output",
+                    "Download from GitHub",
+                    prof_run.profile.download_url,
+                )
+
     if "leaderboard" in runs:
         bench_run = runs["leaderboard"]
         if _handle_crash_report(report, bench_run):

src/libkernelbot/run_eval.py

Lines changed: 16 additions & 10 deletions
@@ -1,3 +1,4 @@
+import copy
 import dataclasses
 import datetime
 import functools
@@ -678,12 +679,13 @@ def run_pytorch_script( # noqa: C901


 class _EvalRunner(Protocol):
-    def __call__(self, mode: str) -> EvalResult: ...
+    def __call__(self, mode: str, **kwargs) -> EvalResult: ...


 def run_evaluation(
     call: _EvalRunner,
     mode: str,
+    common_args: dict,
 ) -> dict[str, EvalResult]:
     """
     Given a "runner" function `call`, interprets the mode
@@ -693,22 +695,28 @@
     require multiple runner calls.
     """
     results: dict[str, EvalResult] = {}
-    if mode in ["test", "benchmark", "profile"]:
-        results[mode] = call(mode=mode)
+    if mode == "profile":
+        benchmarks = copy.deepcopy(common_args["benchmarks"])
+        for i, benchmark in enumerate(benchmarks.splitlines()):
+            common_args["benchmarks"] = benchmark
+            results[f"{mode}.{i}"] = call(mode=mode, **common_args)
+
+    elif mode in ["test", "benchmark"]:
+        results[mode] = call(mode=mode, **common_args)
     elif mode in ["private", "leaderboard"]:
         # first, run the tests
-        results["test"] = call(mode="test")
+        results["test"] = call(mode="test", **common_args)

         if not results["test"].run or not results["test"].run.passed:
             return results

-        results["benchmark"] = call(mode="benchmark")
+        results["benchmark"] = call(mode="benchmark", **common_args)

         if not results["benchmark"].run or not results["benchmark"].run.passed:
             return results

         # if they pass, run the leaderboard validation
-        results["leaderboard"] = call(mode="leaderboard")
+        results["leaderboard"] = call(mode="leaderboard", **common_args)
     else:
         raise AssertionError("Invalid mode")

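The new profile branch above is the core of the commit: rather than one profiling run over the whole benchmark list, the newline-separated benchmarks entry of common_args is split and the runner is invoked once per benchmark, keyed as profile.0, profile.1, and so on, so each trace covers exactly one benchmark. A self-contained sketch of that loop, with a dummy runner standing in for the real call and made-up benchmark strings:

import copy

def dummy_call(mode: str, **kwargs) -> str:
    # Stand-in for the real runner; just records what it was asked to profile.
    return f"{mode} run of {kwargs['benchmarks']!r}"

# Hypothetical common_args with two newline-separated benchmark definitions.
common_args = {"benchmarks": "benchmark_a\nbenchmark_b"}
results = {}

benchmarks = copy.deepcopy(common_args["benchmarks"])
for i, benchmark in enumerate(benchmarks.splitlines()):
    common_args["benchmarks"] = benchmark  # each profile run sees exactly one benchmark
    results[f"profile.{i}"] = dummy_call(mode="profile", **common_args)

print(results)
# {'profile.0': "profile run of 'benchmark_a'",
#  'profile.1': "profile run of 'benchmark_b'"}
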
@@ -742,8 +750,7 @@ def run_config(config: dict):
         runner = functools.partial(
             run_pytorch_script,
             sources=config["sources"],
-            main=config["main"],
-            **common_args,
+            main=config["main"]
         )
     elif config["lang"] == "cu":
         runner = functools.partial(
@@ -755,10 +762,9 @@
             include_dirs=config.get("include_dirs", []),
             libraries=config.get("libraries", []),
             flags=CUDA_FLAGS,
-            **common_args,
         )
     else:
         raise ValueError(f"Invalid language {config['lang']}")

-    results = run_evaluation(runner, config["mode"])
+    results = run_evaluation(runner, config["mode"], common_args)
     return FullResult(success=True, error="", runs=results, system=system)
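
Previously common_args was baked into the functools.partial runner, so run_evaluation had no way to vary the benchmark list between calls. Passing it explicitly, and forwarding it through **kwargs as the widened _EvalRunner protocol now permits, lets the profile branch override benchmarks per invocation while the partial keeps only the language-specific arguments. A rough sketch of the resulting call shape, with a stub in place of run_pytorch_script and hypothetical argument values:

import functools

def run_pytorch_script_stub(mode: str, sources=None, main=None, **common_args):
    # Stand-in for run_pytorch_script; the real one builds and runs the submission.
    return (mode, main, common_args.get("benchmarks"))

runner = functools.partial(
    run_pytorch_script_stub,
    sources={"submission.py": "..."},
    main="submission.py",
)

# run_evaluation can now decide, per call, which benchmarks the runner sees.
print(runner(mode="profile", benchmarks="benchmark_a"))
# ('profile', 'submission.py', 'benchmark_a')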
