diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml index 13841d27..a619f8e1 100644 --- a/.github/workflows/nvidia-arc-health.yml +++ b/.github/workflows/nvidia-arc-health.yml @@ -6,27 +6,15 @@ on: - cron: '0 2 * * *' workflow_dispatch: push: - branches: [main] jobs: health-check: - runs-on: [gpumode-nvidia-arc] + runs-on: [nvidia-docker-b200-8-x86-64] timeout-minutes: 5 - container: - image: nvidia/cuda:12.4.0-devel-ubuntu22.04 steps: - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - - name: Install PyTorch - run: | - pip install torch - - name: GPU Health Check - run: python -c "import torch; torch.randn(5, device='cuda')" + run: python3 -c "import torch; torch.randn(5, device='cuda')" env: CUDA_VISIBLE_DEVICES: 0 diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index b50ec044..6f455fbe 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -19,22 +19,15 @@ run-name: 'NVIDIA Job - ${{ github.event.inputs.run_id }}' jobs: run: - runs-on: [gpumode-nvidia-arc] + runs-on: [nvidia-docker-b200-8-x86-64] timeout-minutes: 10 - container: - image: nvidia/cuda:12.4.0-devel-ubuntu22.04 steps: - uses: actions/checkout@v3 - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - - name: Install uv - uses: astral-sh/setup-uv@v3 - with: - version: "latest" + - name: nvidia-smi + shell: bash + run: | + nvidia-smi || echo "nvidia-smi failed" - name: Create input files shell: bash @@ -49,30 +42,18 @@ jobs: # Now write to file (won't be logged since it's masked) echo "$PAYLOAD" > payload.json - - name: Install uv - uses: astral-sh/setup-uv@v3 - with: - version: "latest" - - - name: Setup Python environment + - name: Setup Virtual Environment and Install Dependencies shell: bash run: | - uv venv .venv - echo "VIRTUAL_ENV=$PWD/.venv" >> $GITHUB_ENV - echo "$PWD/.venv/bin" >> $GITHUB_PATH + pip install --upgrade pip + pip install -r "requirements.txt" + pip install -e . - if [[ -n "${{ github.event.inputs.requirements }}" ]]; then - cat > "requirements.txt" <<'EOL' - ${{ github.event.inputs.requirements }} - EOL - uv pip install -r "requirements.txt" - fi - uv pip install -e . - name: Run script shell: bash run: | - python src/runners/github-runner.py + python3 src/runners/github-runner.py - name: Upload training artifacts uses: actions/upload-artifact@v4 @@ -88,5 +69,3 @@ jobs: name: profile-data path: profile_data/* retention-days: 1 - env: - CUDA_VISIBLE_DEVICES: 0 diff --git a/examples/eval.py b/examples/eval.py index 597b5ff4..187e11cd 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -500,9 +500,9 @@ def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: l return 112 -def _run_single_profile(test: TestCase) -> str: +def _run_single_profile_torch(test: TestCase) -> str: """ - Runs a single test case. Do not call directly + Profiles a single benchmark using the torch profiler. 
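+    Returns the profiler's key_averages() table (top entries by self CUDA
+    time) as a string.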
""" from submission import custom_kernel from torch.profiler import profile, ProfilerActivity @@ -511,14 +511,36 @@ def _run_single_profile(test: TestCase) -> str: data = generate_input(**test.args) torch.cuda.synchronize() + cloned = _clone_data(data, 0) with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: with nvtx_range("custom_kernel"): - submission_output = custom_kernel(_clone_data(data, 0)) + submission_output = custom_kernel(cloned) torch.cuda.synchronize() return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20) +def _run_single_profile_ncu(test: TestCase) -> str: + """ + Profiles a single benchmark using ncu. Note: this does not + invoke NCU; instead, it is expected that eval is launched + under NCU, and this function will rurnthe kernel excactly + once in the 'custom_kernel' nvtx range. + """ + from submission import custom_kernel + + with nvtx_range("generate input"): + data = generate_input(**test.args) + torch.cuda.synchronize() + + cloned = _clone_data(data, 0) + with nvtx_range("custom_kernel"): + submission_output = custom_kernel(cloned) + torch.cuda.synchronize() + + return "" + + def _run_distributed_profile(test: TestCase, rank: int) -> "EventList": """ Runs a single profiling case. Do not call directly @@ -610,7 +632,10 @@ def run_single_profile(test: TestCase, pool: multiprocessing.Pool) -> str: """ world_size = test.args.get("world_size", None) if world_size is None: - return pool.apply(_run_single_profile, (test,)) + if bool(os.getenv("POPCORN_NCU", "0")): + return pool.apply(_run_single_profile_ncu, (test,)) + else: + return pool.apply(_run_single_profile_torch, (test,)) else: return run_multi_gpu_profile(pool, test, world_size) diff --git a/scripts/ci_test_cuda.py b/scripts/ci_test_cuda.py index c3fa893c..de1f5fbe 100644 --- a/scripts/ci_test_cuda.py +++ b/scripts/ci_test_cuda.py @@ -19,12 +19,12 @@ def run_cuda_helper(sources: dict, headers: dict = None, arch=None, **kwargs): headers = header_files eval_result = run_cuda_script( - make_system_info(), sources, headers, arch=arch, mode=SubmissionMode.TEST.value, tests="size: 256; seed: 42\n", + system=make_system_info(), **kwargs, ) return eval_result.compilation, eval_result.run @@ -195,12 +195,12 @@ def test_include_dirs(tmp_path: Path): # can also use generic flags argument result = run_cuda_script( - make_system_info(), {"eval.cu": eval_cu, "submission.cu": sub}, header_files, flags=["-I.", f"-I{tmp_path}"], mode=SubmissionMode.TEST.value, tests="size: 256; seed: 42\n", + system=make_system_info(), ) assert result.compilation.success is True diff --git a/scripts/ci_test_python.py b/scripts/ci_test_python.py index 7cc4fedd..1bd8dd9f 100644 --- a/scripts/ci_test_python.py +++ b/scripts/ci_test_python.py @@ -12,11 +12,11 @@ def run_pytorch_helper(sources: dict, tests=None, **kwargs): result = run_pytorch_script( - make_system_info(), sources, "eval.py", mode=SubmissionMode.TEST.value, tests=tests or "size: 256; seed: 42\n", + system=make_system_info(), **kwargs, ) return result.run @@ -45,7 +45,7 @@ def custom_kernel(input): run = run_pytorch_helper({**files, "submission.py": sub}) assert run.success is True assert run.passed is False - assert "python eval.py test" in run.command + assert "python3 eval.py test" in run.command assert run.stdout == "" assert run.stderr == "" diff --git a/src/kernelbot/discord_reporter.py b/src/kernelbot/discord_reporter.py index 3b6fd8c3..d0551b07 100644 --- a/src/kernelbot/discord_reporter.py +++ b/src/kernelbot/discord_reporter.py 
@@ -1,7 +1,8 @@
 import discord
 
-from discord_utils import _send_split_log
+from discord_utils import _send_file, _send_split_log
 from libkernelbot.report import (
+    File,
     Link,
     Log,
     MultiProgressReporter,
@@ -70,6 +71,11 @@ async def display_report(self, title: str, report: RunResultReport):
                 message += part.text
             elif isinstance(part, Log):
                 message = await _send_split_log(thread, message, part.header, part.content)
+            elif isinstance(part, File):
+                if len(message) > 0:
+                    await thread.send(message)
+                await _send_file(thread, part.message, part.name, part.content)
+                message = ""
             elif isinstance(part, Link):
                 if len(message) > 0:
                     await thread.send(message)
diff --git a/src/kernelbot/discord_utils.py b/src/kernelbot/discord_utils.py
index d014f3ca..7924a3d2 100644
--- a/src/kernelbot/discord_utils.py
+++ b/src/kernelbot/discord_utils.py
@@ -1,5 +1,6 @@
 import functools
 import logging
+from io import BytesIO
 
 import discord
 
@@ -124,7 +125,7 @@ async def _send_split_log(thread: discord.Thread, partial_message: str, header:
         else:
             if partial_message != "":
                 chunks.append(partial_message)
-            partial_message = line
+            partial_message = line + "\n"
 
     if partial_message != "":
         chunks.append(partial_message)
@@ -133,6 +134,10 @@ async def _send_split_log(thread: discord.Thread, partial_message: str, header:
     for i, chunk in enumerate(chunks):
         partial_message = f"\n\n## {header} ({i+1}/{len(chunks)}):\n"
         partial_message += f"```\n{limit_length(chunk, 1900)}```"
-        await thread.send(partial_message)
+        await thread.send(partial_message, silent=True)
 
     return ""
+
+
+async def _send_file(thread: discord.Thread, message: str, name: str, file: bytes):
+    await thread.send(message, file=discord.File(BytesIO(file), filename=name), silent=True)
diff --git a/src/libkernelbot/launchers/github.py b/src/libkernelbot/launchers/github.py
index 4c1b1d5f..3f09b94d 100644
--- a/src/libkernelbot/launchers/github.py
+++ b/src/libkernelbot/launchers/github.py
@@ -143,7 +143,7 @@ async def run_submission(  # noqa: C901
         # Update profile artifact to the actual download URL.
         # For the GitHub launcher the profile_artifact currently just contains
         # the name of the artifact.
-        if profile_res is not None:
+        if profile_res is not None and "profile-data" in index:
             profile_res.download_url = index["profile-data"].public_download_url
 
     res = EvalResult(
diff --git a/src/libkernelbot/report.py b/src/libkernelbot/report.py
index 25bb27cb..58beaffe 100644
--- a/src/libkernelbot/report.py
+++ b/src/libkernelbot/report.py
@@ -43,9 +43,19 @@ class Link:
     url: str
 
 
+@dataclasses.dataclass
+class File:
+    """
+    File represents a file that gets attached to the report.
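+    `name` is the attachment's file name, `message` is a short caption shown
+    alongside it, and `content` holds the raw bytes to upload.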
+ """ + name: str + message: str + content: bytes + + class RunResultReport: def __init__(self, data=None): - self.data: List[Text | Log | Link] = data or [] + self.data: List[Text | Log | Link | File] = data or [] def add_text(self, section: str): self.data.append(Text(section)) @@ -56,6 +66,9 @@ def add_log(self, header: str, log: str): def add_link(self, title: str, text: str, url: str): self.data.append(Link(title, text, url)) + def add_file(self, name: str, message: str, content: bytes): + self.data.append(File(name, message, content)) + def __repr__(self): return f"RunResultReport(data={self.data})" @@ -174,16 +187,18 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n elif full: result.append("❌ Benchmarks missing") - if "profile" in runs: - bench_run = runs["profile"].run - if not bench_run.success: - result.append("❌ Running profile failed" + _short_fail_reason(bench_run)) - return result - elif not bench_run.passed: - result.append("❌ Profiling failed") - return result - else: - result.append("✅ Profiling successful") + profile_runs = [v for k, v in runs.items() if k.startswith("profile")] + if len(profile_runs) > 0: + for prof_run in profile_runs: + bench_run = prof_run.run + if not bench_run.success: + result.append("❌ Running profile failed" + _short_fail_reason(bench_run)) + return result + elif not bench_run.passed: + result.append("❌ Profiling failed") + return result + else: + result.append("✅ Profiling successful") if "leaderboard" in runs: lb_run = runs["leaderboard"].run @@ -257,12 +272,9 @@ def make_profile_log(run: RunResult) -> str: num_bench = int(run.result.get("benchmark-count", 0)) def log_one(base_name): - spec = run.result.get(f"{base_name}.spec") - report: str = run.result.get(f"{base_name}.report") report = base64.b64decode(report.encode("utf-8"), b"+*").decode("utf-8") report = textwrap.indent(report, " ") - bench_log.append(f"{spec}\n") bench_log.append(report) bench_log = [] @@ -299,6 +311,10 @@ def _handle_crash_report(report: RunResultReport, run_result: EvalResult): return False +def _shortname(spec: str): + return spec.replace(": ", "=").replace("; ", "_") + + def generate_report(result: FullResult) -> RunResultReport: # noqa: C901 runs = result.runs report = RunResultReport() @@ -327,22 +343,33 @@ def generate_report(result: FullResult) -> RunResultReport: # noqa: C901 make_benchmark_log(bench_run.run), ) - if "profile" in runs: - prof_run = runs["profile"] - if _handle_crash_report(report, prof_run): - return report - - report.add_log( - "Profiling", - make_profile_log(prof_run.run), - ) - - if prof_run.profile is not None and prof_run.profile.download_url is not None: - report.add_link( - f"{prof_run.profile.profiler} profiling output", - "Download from GitHub", - prof_run.profile.download_url, - ) + profile_runs = [v for k, v in runs.items() if k.startswith("profile")] + if len(profile_runs) > 0: + for prof_run in profile_runs: + if _handle_crash_report(report, prof_run): + return report + + if prof_run.profile.trace is not None: + report.add_log( + f"Profiling {prof_run.run.result.get('benchmark.0.spec')}", + make_profile_log(prof_run.run), + ) + + if prof_run.profile.download_url is not None: + report.add_link( + f"{prof_run.profile.profiler} profiling output", + "Download from GitHub", + prof_run.profile.download_url, + ) + + for prof_run in profile_runs: + if prof_run.profile is not None: + if prof_run.profile.trace is not None: + report.add_file( + 
f"profile-{_shortname(prof_run.run.result.get('benchmark.0.spec'))}.zip", + f"{prof_run.profile.profiler} report - " + prof_run.run.result.get("benchmark.0.spec"), + base64.b64decode(prof_run.profile.trace), + ) if "leaderboard" in runs: bench_run = runs["leaderboard"] diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index c0897baf..e3879ee7 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -1,3 +1,5 @@ +import base64 +import copy import dataclasses import datetime import functools @@ -19,10 +21,13 @@ class ProfileResult: # fmt: off profiler: str # The profiler used to gather this data + # Profiler trace. May be empty, in which case `download_url` + # should point to the trace file. + trace: str # Public download URL of all files created by the profiler # This may also be configured later download_url: Optional[str] - #fmt: on + # fmt: on @dataclasses.dataclass @@ -122,6 +127,47 @@ def _create_files(files: Optional[dict[str, str]]): Path(name).write_text(content) +def _directory_to_zip_bytes(directory_path) -> str: + """Create a zip archive and return as base64 encoded bytes.""" + with tempfile.TemporaryDirectory() as temp_dir: + archive_path = os.path.join(temp_dir, 'archive') + shutil.make_archive(archive_path, 'zip', directory_path) + + with open(archive_path + '.zip', 'rb') as f: + data = f.read() + + return base64.b64encode(data).decode('utf-8') + + +def _filter_ncu_report(report: str, tables: list): + """ + Extract the Speed-of-light section from the full ncu terminal report. + + For expert users, we just attach the full ncu profile to the result, + and they can view whichever metrics they are interested in. But to + encourage novice users to try out profiling, we want to have a + *simple* set of things to display automatically, short enough to fit + in a *single* discord message. + """ + result = "" + collect = False + for line in report.splitlines(): + if "Table Name : " in line: + table = line[line.find("Table Name :") + len("Table Name :"):].strip() + if table in tables: + result += "\n" + collect = True + else: + collect = False + + if len(line.strip()) == 0: + collect = False + + if collect: + result += line + "\n" + return result + + def compile_cuda_script( # # noqa: C901 files: list[str], arch: Optional[int] = None, @@ -305,6 +351,122 @@ def run_program( ) +def profile_program_roc( + call: list[str], + seed: Optional[int], + timeout: int, + multi_gpu: bool, + output_dir: Path, +) -> tuple[RunResult, Optional[ProfileResult]]: + # Wrap program in rocprof + call = [ + "rocprofv3", + "--log-level", + "fatal", + "--hip-trace", + "--kernel-trace", + "--rccl-trace", + "--marker-trace", + "--hip-trace", + "--memory-copy-trace", + # New? Doesn't work in the runner + # "--memory-allocation-trace", + "--scratch-memory-trace", + # The HSA trace output is very large, so skip it for now + # "--hsa-trace", + "--output-format", + "pftrace", + "csv", + "-d", + str(output_dir), + # Just store the files as %pid%_tracename.ext instead of putting them in an + # additional directory named after the hostname. + "-o", + # Insert an extra path here so that the resulting zip has all files + # in the profile_data/ directory rather than the root. + "%pid%", + "--", + ] + call + + run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={ + "GPU_DUMP_CODE_OBJECT": "1", + }, + ) + + profile_result = None + + if run_result.success: + # Post-process trace data. 
+        # rocPROF generates one trace for every process, but it's more useful to
+        # have all traces be in the same file. Fortunately we can do that by
+        # concatenating.
+        traces = list(output_dir.glob("*.pftrace"))
+        with (output_dir / "combined.pftrace").open("wb") as combined:
+            for trace_path in traces:
+                with trace_path.open("rb") as trace:
+                    shutil.copyfileobj(trace, combined)
+
+                # After we've created the combined trace, there is no point in
+                # keeping the individual traces around.
+                trace_path.unlink()
+
+        # Also move the code objects to the profiling output directory.
+        for code_obj in list(Path.cwd().glob("_code_object*.o")):
+            code_obj.rename(output_dir / code_obj.name)
+
+        profile_result = ProfileResult(
+            profiler="rocPROF",
+            trace=_directory_to_zip_bytes(output_dir),
+            download_url=None,
+        )
+
+    return run_result, profile_result
+
+
+def profile_program_ncu(
+    call: list[str],
+    seed: Optional[int],
+    timeout: int,
+    multi_gpu: bool,
+    output_dir: Path,
+) -> tuple[RunResult, Optional[ProfileResult]]:
+    assert not multi_gpu, "Multi-GPU profiling not supported for ncu."
+
+    # Wrap program in ncu
+    call = [
+        "ncu",
+        "--set", "full",
+        "--nvtx",
+        "--nvtx-include", "custom_kernel/",
+        "--import-source", "1",
+        "-o", f"{str(output_dir / 'profile.ncu-rep')}",
+        "--",
+    ] + call
+
+    run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={
+        "POPCORN_NCU": "1"
+    })
+    profile_result = None
+
+    try:
+        get_tables = ["GPU Throughput", "Pipe Utilization (% of active cycles)", "Warp State (All Cycles)"]
+        ncu_cmd = ["ncu", "--import", f"{str(output_dir / 'profile.ncu-rep')}", "--print-details", "body"]
+        report = subprocess.check_output(ncu_cmd, text=True)
+        report = _filter_ncu_report(report, get_tables)
+        run_result.result["benchmark.0.report"] = base64.b64encode(report.encode("utf-8")).decode("utf-8")
+    except subprocess.CalledProcessError:
+        pass
+
+    if run_result.success:
+        profile_result = ProfileResult(
+            profiler='Nsight-Compute',
+            trace=_directory_to_zip_bytes(output_dir),
+            download_url=None,
+        )
+
+    return run_result, profile_result
+
+
 def profile_program(
     system: SystemInfo,
     call: list[str],
@@ -315,82 +477,25 @@
     # The runner-specific configuration should implement logic
     # to fetch the data in this directory and return it as
     # ProfileResult.download_url.
-    # Insert an extra nested nested path here so that the resulting zip has all files
+    # Insert an extra nested path here so that the resulting zip has all files
     # in the profile_data/ directory rather than directly in the root.
-    output_dir = Path(".") / "profile_data" / "profile_data"
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-    if system.runtime == "ROCm":
-        # Wrap program in rocprof
-        call = [
-            "rocprofv3",
-            "--log-level",
-            "fatal",
-            "--hip-trace",
-            "--kernel-trace",
-            "--rccl-trace",
-            "--marker-trace",
-            "--hip-trace",
-            "--memory-copy-trace",
-            # New? Doesn't work in the runner
-            # "--memory-allocation-trace",
-            "--scratch-memory-trace",
-            # The HSA trace output is very large, so skip it for now
-            # "--hsa-trace",
-            "--output-format",
-            "pftrace",
-            "csv",
-            "-d",
-            str(output_dir),
-            # Just store the files as %pid%_tracename.ext instead of putting them in an
-            # additional directory named after the hostname.
-            "-o",
-            # Insert an extra path here so that the resulting zip has all files
-            # in the profile_data/ directory rather than the root.
- "%pid%", - "--", - ] + call - - run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={ - "GPU_DUMP_CODE_OBJECT": "1", - }) - - profile_result = None - - if run_result.success: - # Post-process trace data. - # rocPROF generates one trace for every process, but its more useful to - # have all traces be in the same file. Fortunately we can do that by - # concatenating. - traces = list(output_dir.glob("*.pftrace")) - with (output_dir / "combined.pftrace").open("wb") as combined: - for trace_path in traces: - with trace_path.open("rb") as trace: - shutil.copyfileobj(trace, combined) - - # After we've created the combined trace, there is no point in - # keeping the individual traces around. - trace_path.unlink() - - # Also move the code objects to the profiling output directory. - for code_obj in list(Path.cwd().glob("_code_object*.o")): - code_obj.rename(output_dir / code_obj.name) - - profile_result = ProfileResult( - profiler='rocPROF', - download_url=None, - ) - return run_result, profile_result - else: - # TODO: Implement profiling for other platforms - return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None + with tempfile.TemporaryDirectory(dir=".") as tmpdir: + output_dir = Path(tmpdir) / "profile_data" + output_dir.mkdir() + if system.runtime == "ROCm": + return profile_program_roc(call, seed, timeout, multi_gpu, output_dir) + elif system.runtime == "CUDA": + return profile_program_ncu(call, seed, timeout, multi_gpu, output_dir) + else: + raise ValueError(f"Unknown runtime {system.runtime}") + def run_single_evaluation( - system: SystemInfo, call: list[str], mode: str, *, + system: SystemInfo, multi_gpu: bool = False, tests: Optional[str] = None, benchmarks: Optional[str] = None, @@ -419,7 +524,7 @@ def run_single_evaluation( cases.flush() - call += [mode, cases.name] + call = call + [mode, cases.name] if mode == "profile": return profile_program(system, call, seed=seed, timeout=timeout, multi_gpu=multi_gpu) @@ -427,7 +532,7 @@ def run_single_evaluation( return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None -def make_system_info() -> SystemInfo: # noqa: C901 +def make_system_info() -> SystemInfo: # noqa: C901 info = SystemInfo() try: import torch @@ -448,14 +553,16 @@ def make_system_info() -> SystemInfo: # noqa: C901 info.gpu = subprocess.check_output( ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], encoding="utf-8" ) - info.device_count = info.gpu.count('\n') + info.device_count = info.gpu.count("\n") info.runtime = "CUDA" except subprocess.CalledProcessError: # try again for HIP try: - rocm_info = json.loads(subprocess.check_output( - ["rocm-smi", "--showproductname", "--json"], encoding="utf-8" - )) + rocm_info = json.loads( + subprocess.check_output( + ["rocm-smi", "--showproductname", "--json"], encoding="utf-8" + ) + ) if len(rocm_info) > 0: info.gpu = next(rocm_info.__iter__())["Card Series"] @@ -489,7 +596,6 @@ def make_system_info() -> SystemInfo: # noqa: C901 def run_cuda_script( # # noqa: C901 - system: SystemInfo, sources: dict[str, str], headers: Optional[dict[str, str]] = None, arch: Optional[int] = None, @@ -550,7 +656,7 @@ def run_cuda_script( # # noqa: C901 if os.path.exists(f): os.remove(f) - run_result, profile_result = run_single_evaluation(system, ["./eval.out"], **kwargs) + run_result, profile_result = run_single_evaluation(["./eval.out"], **kwargs) return EvalResult( start=start, end=datetime.datetime.now(), @@ -561,7 +667,6 @@ def run_cuda_script( # # noqa: C901 
def run_pytorch_script( # noqa: C901 - system: SystemInfo, sources: dict[str, str], main: str, **kwargs, @@ -587,7 +692,7 @@ def run_pytorch_script( # noqa: C901 # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. try: - compile_run = run_program(["python", "submission.py"], seed=1, timeout=Timeout.COMPILE) + compile_run = run_program(["python3", "submission.py"], seed=1, timeout=Timeout.COMPILE) if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: comp = CompileResult( nvcc_found=True, @@ -613,7 +718,7 @@ def run_pytorch_script( # noqa: C901 exit_code=e.returncode, ) - run, profile = run_single_evaluation(system, ["python", main], **kwargs) + run, profile = run_single_evaluation(["python3", main], **kwargs) return EvalResult( start=start, @@ -629,12 +734,13 @@ def run_pytorch_script( # noqa: C901 class _EvalRunner(Protocol): - def __call__(self, mode: str) -> EvalResult: ... + def __call__(self, mode: str, **kwargs) -> EvalResult: ... def run_evaluation( call: _EvalRunner, mode: str, + common_args: dict, ) -> dict[str, EvalResult]: """ Given a "runner" function `call`, interprets the mode @@ -644,22 +750,28 @@ def run_evaluation( require multiple runner calls. """ results: dict[str, EvalResult] = {} - if mode in ["test", "benchmark", "profile"]: - results[mode] = call(mode=mode) + if mode == "profile": + benchmarks = copy.deepcopy(common_args["benchmarks"]) + for i, benchmark in enumerate(benchmarks.splitlines()): + common_args["benchmarks"] = benchmark + results[f"{mode}.{i}"] = call(mode=mode, **common_args) + + elif mode in ["test", "benchmark"]: + results[mode] = call(mode=mode, **common_args) elif mode in ["private", "leaderboard"]: # first, run the tests - results["test"] = call(mode="test") + results["test"] = call(mode="test", **common_args) if not results["test"].run or not results["test"].run.passed: return results - results["benchmark"] = call(mode="benchmark") + results["benchmark"] = call(mode="benchmark", **common_args) if not results["benchmark"].run or not results["benchmark"].run.passed: return results # if they pass, run the leaderboard validation - results["leaderboard"] = call(mode="leaderboard") + results["leaderboard"] = call(mode="leaderboard", **common_args) else: raise AssertionError("Invalid mode") @@ -693,8 +805,7 @@ def run_config(config: dict): runner = functools.partial( run_pytorch_script, sources=config["sources"], - main=config["main"], - **common_args, + main=config["main"] ) elif config["lang"] == "cu": runner = functools.partial( @@ -706,10 +817,9 @@ def run_config(config: dict): include_dirs=config.get("include_dirs", []), libraries=config.get("libraries", []), flags=CUDA_FLAGS, - **common_args, ) else: raise ValueError(f"Invalid language {config['lang']}") - results = run_evaluation(runner, config["mode"]) + results = run_evaluation(runner, config["mode"], common_args) return FullResult(success=True, error="", runs=results, system=system) diff --git a/src/runners/modal_runner.py b/src/runners/modal_runner.py index 6a048e62..d2cb0d64 100644 --- a/src/runners/modal_runner.py +++ b/src/runners/modal_runner.py @@ -16,7 +16,7 @@ # Move this to another file later: cuda_image = ( - Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.12") + Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.13") .apt_install( "git", "gcc-13", diff --git a/tests/test_report.py b/tests/test_report.py index a1964e62..9006a98e 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -6,6 +6,7 
@@ from libkernelbot import consts from libkernelbot.report import ( + File, RunResultReport, _generate_compile_report, _short_fail_reason, @@ -402,7 +403,6 @@ def test_make_profile_log(): log = make_profile_log(run) - assert "Matrix multiplication profile" in log assert " Profile line 1" in log assert " Profile line 2" in log @@ -664,6 +664,7 @@ def test_generate_report_profile(sample_full_result: FullResult): } sample_full_result.runs["profile"].profile = ProfileResult( profiler="NSight", + trace="", download_url="https://example.com", ) report = generate_report(sample_full_result) @@ -687,8 +688,11 @@ def test_generate_report_profile(sample_full_result: FullResult): "❌ Test division\n" "> Division by zero", ), - Log(header="Profiling", content="Benchmark\n\n Profile report\n"), + Log(header='Profiling Benchmark', content=' Profile report\n'), Link("NSight profiling output", "Download from GitHub", "https://example.com"), + File(name='profile-Benchmark.zip', + message='NSight report - Benchmark', + content=b''), ]