
Commit 394e234

profile each benchmark individually for cleaner traces
1 parent d754094 commit 394e234

2 files changed: +43 -34 lines

src/libkernelbot/report.py

Lines changed: 27 additions & 24 deletions
@@ -174,16 +174,18 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n
     elif full:
         result.append("❌ Benchmarks missing")

-    if "profile" in runs:
-        bench_run = runs["profile"].run
-        if not bench_run.success:
-            result.append("❌ Running profile failed" + _short_fail_reason(bench_run))
-            return result
-        elif not bench_run.passed:
-            result.append("❌ Profiling failed")
-            return result
-        else:
-            result.append("✅ Profiling successful")
+    profile_runs = [v for k, v in runs.items() if k.startswith("profile")]
+    if len(profile_runs) > 0:
+        for prof_run in profile_runs:
+            bench_run = prof_run.run
+            if not bench_run.success:
+                result.append("❌ Running profile failed" + _short_fail_reason(bench_run))
+                return result
+            elif not bench_run.passed:
+                result.append("❌ Profiling failed")
+                return result
+            else:
+                result.append("✅ Profiling successful")

     if "leaderboard" in runs:
         lb_run = runs["leaderboard"].run
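
The short report no longer looks up a single "profile" entry: every run whose key starts with "profile" is collected and summarized in turn, so the per-benchmark runs introduced in run_eval.py each get their own status line. A minimal sketch of that prefix lookup, using SimpleNamespace stand-ins instead of the real EvalResult objects and made-up keys:

from types import SimpleNamespace

# Stand-ins for EvalResult; only the fields the short report inspects.
runs = {
    "test": SimpleNamespace(run=SimpleNamespace(success=True, passed=True)),
    "profile.0": SimpleNamespace(run=SimpleNamespace(success=True, passed=True)),
    "profile.1": SimpleNamespace(run=SimpleNamespace(success=True, passed=False)),
}

# Same prefix match as in the diff above: picks up every per-benchmark profiling run.
profile_runs = [v for k, v in runs.items() if k.startswith("profile")]
assert len(profile_runs) == 2

for prof_run in profile_runs:
    print("✅" if prof_run.run.passed else "❌", "profiling run")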
@@ -327,23 +329,24 @@ def generate_report(result: FullResult) -> RunResultReport: # noqa: C901
             make_benchmark_log(bench_run.run),
         )

-    if "profile" in runs:
-        prof_run = runs["profile"]
-        if _handle_crash_report(report, prof_run):
-            return report
+    profile_runs = [v for k, v in runs.items() if k.startswith("profile")]
+    if len(profile_runs) > 0:
+        for prof_run in profile_runs:
+            if _handle_crash_report(report, prof_run):
+                return report

-        report.add_log(
-            "Profiling",
-            make_profile_log(prof_run.run),
-        )
-
-        if prof_run.profile is not None and prof_run.profile.download_url is not None:
-            report.add_link(
-                f"{prof_run.profile.profiler} profiling output",
-                "Download from GitHub",
-                prof_run.profile.download_url,
+            report.add_log(
+                "Profiling",
+                make_profile_log(prof_run.run),
             )

+            if prof_run.profile is not None and prof_run.profile.download_url is not None:
+                report.add_link(
+                    f"{prof_run.profile.profiler} profiling output",
+                    "Download from GitHub",
+                    prof_run.profile.download_url,
+                )
+
     if "leaderboard" in runs:
         bench_run = runs["leaderboard"]
         if _handle_crash_report(report, bench_run):

src/libkernelbot/run_eval.py

Lines changed: 16 additions & 10 deletions
@@ -1,3 +1,4 @@
+import copy
 import dataclasses
 import datetime
 import functools
@@ -678,12 +679,13 @@ def run_pytorch_script( # noqa: C901


 class _EvalRunner(Protocol):
-    def __call__(self, mode: str) -> EvalResult: ...
+    def __call__(self, mode: str, **kwargs) -> EvalResult: ...


 def run_evaluation(
     call: _EvalRunner,
     mode: str,
+    common_args: dict,
 ) -> dict[str, EvalResult]:
     """
     Given a "runner" function `call`, interprets the mode
@@ -693,22 +695,28 @@
     require multiple runner calls.
     """
     results: dict[str, EvalResult] = {}
-    if mode in ["test", "benchmark", "profile"]:
-        results[mode] = call(mode=mode)
+    if mode == "profile":
+        benchmarks = copy.deepcopy(common_args["benchmarks"])
+        for i, benchmark in enumerate(benchmarks.splitlines()):
+            common_args["benchmarks"] = benchmark
+            results[f"{mode}.{i}"] = call(mode=mode, **common_args)
+
+    elif mode in ["test", "benchmark"]:
+        results[mode] = call(mode=mode, **common_args)
     elif mode in ["private", "leaderboard"]:
         # first, run the tests
-        results["test"] = call(mode="test")
+        results["test"] = call(mode="test", **common_args)

         if not results["test"].run or not results["test"].run.passed:
             return results

-        results["benchmark"] = call(mode="benchmark")
+        results["benchmark"] = call(mode="benchmark", **common_args)

         if not results["benchmark"].run or not results["benchmark"].run.passed:
             return results

         # if they pass, run the leaderboard validation
-        results["leaderboard"] = call(mode="leaderboard")
+        results["leaderboard"] = call(mode="leaderboard", **common_args)
     else:
         raise AssertionError("Invalid mode")

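The new profile branch above is the core of the commit: rather than one profiling run over the whole benchmark list, the newline-separated benchmarks entry of common_args is split and the runner is invoked once per benchmark, keyed as profile.0, profile.1, and so on, so each trace covers exactly one benchmark. A self-contained sketch of that loop, with a dummy runner standing in for the real call and made-up benchmark strings:

import copy

def dummy_call(mode: str, **kwargs) -> str:
    # Stand-in for the real runner; just records what it was asked to profile.
    return f"{mode} run of {kwargs['benchmarks']!r}"

# Hypothetical common_args with two newline-separated benchmark definitions.
common_args = {"benchmarks": "benchmark_a\nbenchmark_b"}
results = {}

benchmarks = copy.deepcopy(common_args["benchmarks"])
for i, benchmark in enumerate(benchmarks.splitlines()):
    common_args["benchmarks"] = benchmark  # each profile run sees exactly one benchmark
    results[f"profile.{i}"] = dummy_call(mode="profile", **common_args)

print(results)
# {'profile.0': "profile run of 'benchmark_a'",
#  'profile.1': "profile run of 'benchmark_b'"}
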
@@ -742,8 +750,7 @@ def run_config(config: dict):
         runner = functools.partial(
             run_pytorch_script,
             sources=config["sources"],
-            main=config["main"],
-            **common_args,
+            main=config["main"]
         )
     elif config["lang"] == "cu":
         runner = functools.partial(
@@ -755,10 +762,9 @@
             include_dirs=config.get("include_dirs", []),
             libraries=config.get("libraries", []),
             flags=CUDA_FLAGS,
-            **common_args,
         )
     else:
         raise ValueError(f"Invalid language {config['lang']}")

-    results = run_evaluation(runner, config["mode"])
+    results = run_evaluation(runner, config["mode"], common_args)
     return FullResult(success=True, error="", runs=results, system=system)
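
Previously common_args was baked into the functools.partial runner, so run_evaluation had no way to vary the benchmark list between calls. Passing it explicitly, and forwarding it through **kwargs as the widened _EvalRunner protocol now permits, lets the profile branch override benchmarks per invocation while the partial keeps only the language-specific arguments. A rough sketch of the resulting call shape, with a stub in place of run_pytorch_script and hypothetical argument values:

import functools

def run_pytorch_script_stub(mode: str, sources=None, main=None, **common_args):
    # Stand-in for run_pytorch_script; the real one builds and runs the submission.
    return (mode, main, common_args.get("benchmarks"))

runner = functools.partial(
    run_pytorch_script_stub,
    sources={"submission.py": "..."},
    main="submission.py",
)

# run_evaluation can now decide, per call, which benchmarks the runner sees.
print(runner(mode="profile", benchmarks="benchmark_a"))
# ('profile', 'submission.py', 'benchmark_a')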
