Skip to content

Commit d754094

Browse files
committed
split profiling into rocm/ncu;
small code improvements
1 parent 1de31fd commit d754094

File tree

2 files changed

+145
-80
lines changed

2 files changed

+145
-80
lines changed

examples/eval.py

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -500,9 +500,9 @@ def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: l
500500
return 112
501501

502502

503-
def _run_single_profile(test: TestCase) -> str:
503+
def _run_single_profile_torch(test: TestCase) -> str:
504504
"""
505-
Runs a single test case. Do not call directly
505+
Profiles a single benchmark using the torch profiler.
506506
"""
507507
from submission import custom_kernel
508508
from torch.profiler import profile, ProfilerActivity
@@ -511,14 +511,36 @@ def _run_single_profile(test: TestCase) -> str:
511511
data = generate_input(**test.args)
512512
torch.cuda.synchronize()
513513

514+
cloned = _clone_data(data, 0)
514515
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
515516
with nvtx_range("custom_kernel"):
516-
submission_output = custom_kernel(_clone_data(data, 0))
517+
submission_output = custom_kernel(cloned)
517518
torch.cuda.synchronize()
518519

519520
return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
520521

521522

523+
def _run_single_profile_ncu(test: TestCase) -> str:
    """
    Runs a single benchmark case so that an attached ncu can profile it.

    This function does not start ncu itself. The eval process is expected
    to already be running under ncu; all this does is generate the inputs
    and call the submitted kernel exactly once inside the 'custom_kernel'
    nvtx range.

    Returns an empty string, since no report is produced in-process.
    """
    from submission import custom_kernel

    with nvtx_range("generate input"):
        inputs = generate_input(**test.args)
        torch.cuda.synchronize()

    # The clone is taken outside of the 'custom_kernel' range, so it is not
    # part of that range.
    kernel_args = _clone_data(inputs, 0)
    with nvtx_range("custom_kernel"):
        # The result is not inspected here; only the launch itself matters.
        submission_output = custom_kernel(kernel_args)
        torch.cuda.synchronize()

    return ""
542+
543+
522544
def _run_distributed_profile(test: TestCase, rank: int) -> "EventList":
523545
"""
524546
Runs a single profiling case. Do not call directly
@@ -610,7 +632,10 @@ def run_single_profile(test: TestCase, pool: multiprocessing.Pool) -> str:
610632
"""
611633
world_size = test.args.get("world_size", None)
612634
if world_size is None:
613-
return pool.apply(_run_single_profile, (test,))
635+
if bool(os.getenv("POPCORN_NCU", "0")):
636+
return pool.apply(_run_single_profile_ncu, (test,))
637+
else:
638+
return pool.apply(_run_single_profile_torch, (test,))
614639
else:
615640
return run_multi_gpu_profile(pool, test, world_size)
616641

src/libkernelbot/run_eval.py

Lines changed: 116 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,112 @@ def run_program(
305305
)
306306

307307

308+
def profile_program_roc(
    call: list[str],
    seed: Optional[int],
    timeout: int,
    multi_gpu: bool,
    output_dir: Path,
) -> tuple[RunResult, Optional[ProfileResult]]:
    """
    Runs `call` wrapped in rocprofv3 and post-processes the profiling output.

    Args:
        call: Command line of the program to profile.
        seed, timeout, multi_gpu: Forwarded unchanged to `run_program`.
        output_dir: Directory that rocprofv3 writes into (`-d`); the combined
            trace and the dumped code objects also end up here.

    Returns:
        `(run_result, profile_result)`. `profile_result` is None when the run
        did not succeed; otherwise it is a `ProfileResult` for "rocPROF" whose
        `download_url` is left as None.
    """
    # Wrap program in rocprof
    call = [
        "rocprofv3",
        "--log-level",
        "fatal",
        "--hip-trace",
        "--kernel-trace",
        "--rccl-trace",
        "--marker-trace",
        "--memory-copy-trace",
        # New? Doesn't work in the runner
        # "--memory-allocation-trace",
        "--scratch-memory-trace",
        # The HSA trace output is very large, so skip it for now
        # "--hsa-trace",
        "--output-format",
        "pftrace",
        "csv",
        "-d",
        str(output_dir),
        # Just store the files as %pid%_tracename.ext instead of putting them in an
        # additional directory named after the hostname.
        "-o",
        "%pid%",
        "--",
    ] + call

    run_result = run_program(
        call,
        seed=seed,
        timeout=timeout,
        multi_gpu=multi_gpu,
        extra_env={
            # NOTE(review): presumably what makes the runtime write the
            # _code_object*.o files collected below -- confirm.
            "GPU_DUMP_CODE_OBJECT": "1",
        },
    )

    if not run_result.success:
        return run_result, None

    # Post-process trace data.
    # rocPROF generates one trace for every process, but it's more useful to
    # have all traces be in the same file. Fortunately we can do that by
    # concatenating.
    combined_path = output_dir / "combined.pftrace"
    # A combined trace left over from an earlier run also matches the glob.
    # It must not be used as an input: it is truncated when opened for writing
    # below, and unlinking it afterwards would drop the new combined trace.
    traces = [
        path
        for path in output_dir.glob("*.pftrace")
        if path.name != combined_path.name
    ]
    with combined_path.open("wb") as combined:
        for trace_path in traces:
            with trace_path.open("rb") as trace:
                shutil.copyfileobj(trace, combined)

            # Once a trace has been appended to the combined file, there is
            # no point in keeping the individual file around.
            trace_path.unlink()

    # Also move the code objects to the profiling output directory.
    # The matches are materialized first because the loop moves files out of
    # the directory that is being globbed.
    for code_obj in list(Path.cwd().glob("_code_object*.o")):
        code_obj.rename(output_dir / code_obj.name)

    profile_result = ProfileResult(
        profiler="rocPROF",
        download_url=None,
    )

    return run_result, profile_result
377+
378+
379+
def profile_program_ncu(
    call: list[str],
    seed: Optional[int],
    timeout: int,
    multi_gpu: bool,
    output_dir: Path,
) -> tuple[RunResult, Optional[ProfileResult]]:
    """
    Runs `call` wrapped in ncu, writing the report into `output_dir`.

    Only single-GPU runs are supported. The wrapped program is started with
    POPCORN_NCU=1 in `extra_env`; examples/eval.py reads that variable to
    pick its ncu code path.

    Args:
        call: Command line of the program to profile.
        seed, timeout, multi_gpu: Forwarded unchanged to `run_program`.
        output_dir: Directory that receives 'profile.ncu-rep'.

    Returns:
        `(run_result, profile_result)`. `profile_result` is None when the run
        did not succeed; otherwise it is a `ProfileResult` for 'ncu' whose
        `download_url` is left as None.
    """
    assert not multi_gpu, "Multi-GPU profiling not supported for ncu."

    report_path = output_dir / "profile.ncu-rep"

    # Everything up to and including "--" is consumed by ncu; the program's
    # own command line follows.
    ncu_prefix = [
        "ncu",
        "--set", "full",
        "--nvtx",
        "--nvtx-include", "custom_kernel/",
        "--import-source", "1",
        "-o", str(report_path),
        "--",
    ]

    run_result = run_program(
        ncu_prefix + call,
        seed=seed,
        timeout=timeout,
        multi_gpu=multi_gpu,
        extra_env={"POPCORN_NCU": "1"},
    )

    if not run_result.success:
        return run_result, None

    return run_result, ProfileResult(profiler='ncu', download_url=None)
412+
413+
308414
def profile_program(
309415
system: SystemInfo,
310416
call: list[str],
@@ -315,89 +421,25 @@ def profile_program(
315421
# The runner-specific configuration should implement logic
316422
# to fetch the data in this directory and return it as
317423
# ProfileResult.download_url.
318-
# Insert an extra nested nested path here so that the resulting zip has all files
424+
# Insert an extra nested path here so that the resulting zip has all files
319425
# in the profile_data/ directory rather than directly in the root.
320426
output_dir = Path(".") / "profile_data" / "profile_data"
321427
output_dir.mkdir(parents=True, exist_ok=True)
322428

323429
if system.runtime == "ROCm":
324-
# Wrap program in rocprof
325-
call = [
326-
"rocprofv3",
327-
"--log-level",
328-
"fatal",
329-
"--hip-trace",
330-
"--kernel-trace",
331-
"--rccl-trace",
332-
"--marker-trace",
333-
"--hip-trace",
334-
"--memory-copy-trace",
335-
# New? Doesn't work in the runner
336-
# "--memory-allocation-trace",
337-
"--scratch-memory-trace",
338-
# The HSA trace output is very large, so skip it for now
339-
# "--hsa-trace",
340-
"--output-format",
341-
"pftrace",
342-
"csv",
343-
"-d",
344-
str(output_dir),
345-
# Just store the files as %pid%_tracename.ext instead of putting them in an
346-
# additional directory named after the hostname.
347-
"-o",
348-
# Insert an extra path here so that the resulting zip has all files
349-
# in the profile_data/ directory rather than the root.
350-
"%pid%",
351-
"--",
352-
] + call
353-
354-
run_result = run_program(
355-
call,
356-
seed=seed,
357-
timeout=timeout,
358-
multi_gpu=multi_gpu,
359-
extra_env={
360-
"GPU_DUMP_CODE_OBJECT": "1",
361-
},
362-
)
363-
364-
profile_result = None
365-
366-
if run_result.success:
367-
# Post-process trace data.
368-
# rocPROF generates one trace for every process, but its more useful to
369-
# have all traces be in the same file. Fortunately we can do that by
370-
# concatenating.
371-
traces = list(output_dir.glob("*.pftrace"))
372-
with (output_dir / "combined.pftrace").open("wb") as combined:
373-
for trace_path in traces:
374-
with trace_path.open("rb") as trace:
375-
shutil.copyfileobj(trace, combined)
376-
377-
# After we've created the combined trace, there is no point in
378-
# keeping the individual traces around.
379-
trace_path.unlink()
380-
381-
# Also move the code objects to the profiling output directory.
382-
for code_obj in list(Path.cwd().glob("_code_object*.o")):
383-
code_obj.rename(output_dir / code_obj.name)
384-
385-
profile_result = ProfileResult(
386-
profiler="rocPROF",
387-
download_url=None,
388-
)
389-
390-
return run_result, profile_result
430+
return profile_program_roc(call, seed, timeout, multi_gpu, output_dir)
431+
elif system.runtime == "CUDA":
432+
return profile_program_ncu(call, seed, timeout, multi_gpu, output_dir)
391433
else:
392-
# TODO: Implement profiling for other platforms
393-
return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None
434+
raise ValueError(f"Unknown runtime {system.runtime}")
435+
394436

395437

396438
def run_single_evaluation(
397-
system: SystemInfo,
398439
call: list[str],
399440
mode: str,
400441
*,
442+
system: SystemInfo,
401443
multi_gpu: bool = False,
402444
tests: Optional[str] = None,
403445
benchmarks: Optional[str] = None,
@@ -426,7 +468,7 @@ def run_single_evaluation(
426468

427469
cases.flush()
428470

429-
call += [mode, cases.name]
471+
call = call + [mode, cases.name]
430472

431473
if mode == "profile":
432474
return profile_program(system, call, seed=seed, timeout=timeout, multi_gpu=multi_gpu)
@@ -498,7 +540,6 @@ def make_system_info() -> SystemInfo: # noqa: C901
498540

499541

500542
def run_cuda_script( # # noqa: C901
501-
system: SystemInfo,
502543
sources: dict[str, str],
503544
headers: Optional[dict[str, str]] = None,
504545
arch: Optional[int] = None,
@@ -559,7 +600,7 @@ def run_cuda_script( # # noqa: C901
559600
if os.path.exists(f):
560601
os.remove(f)
561602

562-
run_result, profile_result = run_single_evaluation(system, ["./eval.out"], **kwargs)
603+
run_result, profile_result = run_single_evaluation(["./eval.out"], **kwargs)
563604
return EvalResult(
564605
start=start,
565606
end=datetime.datetime.now(),
@@ -570,7 +611,6 @@ def run_cuda_script( # # noqa: C901
570611

571612

572613
def run_pytorch_script( # noqa: C901
573-
system: SystemInfo,
574614
sources: dict[str, str],
575615
main: str,
576616
**kwargs,
@@ -622,7 +662,7 @@ def run_pytorch_script( # noqa: C901
622662
exit_code=e.returncode,
623663
)
624664

625-
run, profile = run_single_evaluation(system, ["python3", main], **kwargs)
665+
run, profile = run_single_evaluation(["python3", main], **kwargs)
626666

627667
return EvalResult(
628668
start=start,

0 commit comments

Comments
 (0)