@@ -305,6 +305,112 @@ def run_program(
305305 )
306306
307307
def profile_program_roc(
    call: list[str],
    seed: Optional[int],
    timeout: int,
    multi_gpu: bool,
    output_dir: Path,
) -> tuple[RunResult, Optional[ProfileResult]]:
    """Run ``call`` under ``rocprofv3`` and post-process the profiling output.

    Args:
        call: Command line of the program to profile.
        seed: Forwarded unchanged to ``run_program``.
        timeout: Forwarded unchanged to ``run_program``.
        multi_gpu: Forwarded unchanged to ``run_program``.
        output_dir: Directory rocprofv3 writes its traces to (``-d``). The
            combined trace and the dumped code objects end up here as well.

    Returns:
        ``(run_result, profile_result)``. ``profile_result`` is ``None`` when
        the run was not successful; otherwise it is a ``ProfileResult`` whose
        ``download_url`` is left as ``None`` for the caller to fill in.
    """
    # Wrap program in rocprof.
    # Deliberately not passed:
    #   --memory-allocation-trace  (new? doesn't work in the runner)
    #   --hsa-trace                (the HSA trace output is very large)
    call = [
        "rocprofv3",
        "--log-level",
        "fatal",
        "--hip-trace",
        "--kernel-trace",
        "--rccl-trace",
        "--marker-trace",
        "--memory-copy-trace",
        "--scratch-memory-trace",
        "--output-format",
        "pftrace",
        "csv",
        "-d",
        str(output_dir),
        # Just store the files as %pid%_tracename.ext instead of putting them
        # in an additional directory named after the hostname.
        "-o",
        "%pid%",
        "--",
    ] + call

    run_result = run_program(
        call,
        seed=seed,
        timeout=timeout,
        multi_gpu=multi_gpu,
        # NOTE(review): presumably makes the runtime dump the
        # `_code_object*.o` files collected below into the working
        # directory - confirm against the ROCm documentation.
        extra_env={"GPU_DUMP_CODE_OBJECT": "1"},
    )

    if not run_result.success:
        return run_result, None

    # Post-process trace data.
    # rocPROF generates one trace for every process, but it's more useful to
    # have all traces in the same file. Fortunately we can do that by
    # concatenating.
    combined_path = output_dir / "combined.pftrace"
    # Exclude a combined trace left over from a previous run in the same
    # directory: it would otherwise be truncated, read back as an input and
    # finally unlinked, losing the combined output. Sorting keeps the
    # concatenation order deterministic.
    traces = sorted(
        path
        for path in output_dir.glob("*.pftrace")
        if path.name != combined_path.name
    )
    with combined_path.open("wb") as combined:
        for trace_path in traces:
            with trace_path.open("rb") as trace:
                shutil.copyfileobj(trace, combined)

            # After its content is in the combined trace, there is no point
            # in keeping the individual trace around.
            trace_path.unlink()

    # Also move the code objects to the profiling output directory. The glob
    # is materialized first because the loop renames the entries it matched.
    for code_obj in list(Path.cwd().glob("_code_object*.o")):
        code_obj.rename(output_dir / code_obj.name)

    profile_result = ProfileResult(
        profiler="rocPROF",
        download_url=None,
    )
    return run_result, profile_result
377+
378+
def profile_program_ncu(
    call: list[str],
    seed: Optional[int],
    timeout: int,
    multi_gpu: bool,
    output_dir: Path,
) -> tuple[RunResult, Optional[ProfileResult]]:
    """Run ``call`` under NVIDIA Nsight Compute (``ncu``).

    Args:
        call: Command line of the program to profile.
        seed: Forwarded unchanged to ``run_program``.
        timeout: Forwarded unchanged to ``run_program``.
        multi_gpu: Must be ``False``; multi-GPU profiling is not supported
            for ncu.
        output_dir: Directory the ``profile.ncu-rep`` report is written to.

    Returns:
        ``(run_result, profile_result)``. ``profile_result`` is ``None`` when
        the run was not successful; otherwise it is a ``ProfileResult`` whose
        ``download_url`` is left as ``None`` for the caller to fill in.

    Raises:
        AssertionError: If ``multi_gpu`` is true.
    """
    assert not multi_gpu, "Multi-GPU profiling not supported for ncu."

    report_path = output_dir / "profile.ncu-rep"

    # Wrap program in ncu.
    call = [
        "ncu",
        "--set", "full",
        # NOTE(review): presumably limits profiling to kernels launched
        # inside the "custom_kernel" NVTX range - confirm against the
        # Nsight Compute CLI documentation.
        "--nvtx",
        "--nvtx-include", "custom_kernel/",
        "--import-source", "1",
        "-o", str(report_path),
        "--",
    ] + call

    run_result = run_program(
        call,
        seed=seed,
        timeout=timeout,
        multi_gpu=multi_gpu,
        # NOTE(review): presumably tells the evaluated program that it is
        # running under ncu - verify where POPCORN_NCU is read.
        extra_env={"POPCORN_NCU": "1"},
    )

    if not run_result.success:
        return run_result, None

    profile_result = ProfileResult(
        profiler="ncu",
        download_url=None,
    )
    return run_result, profile_result
412+
413+
308414def profile_program (
309415 system : SystemInfo ,
310416 call : list [str ],
@@ -315,89 +421,25 @@ def profile_program(
315421 # The runner-specific configuration should implement logic
316422 # to fetch the data in this directory and return it as
317423 # ProfileResult.download_url.
318- # Insert an extra nested nested path here so that the resulting zip has all files
424+ # Insert an extra nested path here so that the resulting zip has all files
319425 # in the profile_data/ directory rather than directly in the root.
320426 output_dir = Path ("." ) / "profile_data" / "profile_data"
321427 output_dir .mkdir (parents = True , exist_ok = True )
322428
323429 if system .runtime == "ROCm" :
324- # Wrap program in rocprof
325- call = [
326- "rocprofv3" ,
327- "--log-level" ,
328- "fatal" ,
329- "--hip-trace" ,
330- "--kernel-trace" ,
331- "--rccl-trace" ,
332- "--marker-trace" ,
333- "--hip-trace" ,
334- "--memory-copy-trace" ,
335- # New? Doesn't work in the runner
336- # "--memory-allocation-trace",
337- "--scratch-memory-trace" ,
338- # The HSA trace output is very large, so skip it for now
339- # "--hsa-trace",
340- "--output-format" ,
341- "pftrace" ,
342- "csv" ,
343- "-d" ,
344- str (output_dir ),
345- # Just store the files as %pid%_tracename.ext instead of putting them in an
346- # additional directory named after the hostname.
347- "-o" ,
348- # Insert an extra path here so that the resulting zip has all files
349- # in the profile_data/ directory rather than the root.
350- "%pid%" ,
351- "--" ,
352- ] + call
353-
354- run_result = run_program (
355- call ,
356- seed = seed ,
357- timeout = timeout ,
358- multi_gpu = multi_gpu ,
359- extra_env = {
360- "GPU_DUMP_CODE_OBJECT" : "1" ,
361- },
362- )
363-
364- profile_result = None
365-
366- if run_result .success :
367- # Post-process trace data.
368- # rocPROF generates one trace for every process, but its more useful to
369- # have all traces be in the same file. Fortunately we can do that by
370- # concatenating.
371- traces = list (output_dir .glob ("*.pftrace" ))
372- with (output_dir / "combined.pftrace" ).open ("wb" ) as combined :
373- for trace_path in traces :
374- with trace_path .open ("rb" ) as trace :
375- shutil .copyfileobj (trace , combined )
376-
377- # After we've created the combined trace, there is no point in
378- # keeping the individual traces around.
379- trace_path .unlink ()
380-
381- # Also move the code objects to the profiling output directory.
382- for code_obj in list (Path .cwd ().glob ("_code_object*.o" )):
383- code_obj .rename (output_dir / code_obj .name )
384-
385- profile_result = ProfileResult (
386- profiler = "rocPROF" ,
387- download_url = None ,
388- )
389-
390- return run_result , profile_result
430+ return profile_program_roc (call , seed , timeout , multi_gpu , output_dir )
431+ elif system .runtime == "CUDA" :
432+ return profile_program_ncu (call , seed , timeout , multi_gpu , output_dir )
391433 else :
392- # TODO: Implement profiling for other platforms
393- return run_program ( call , seed = seed , timeout = timeout , multi_gpu = multi_gpu ), None
434+ raise ValueError ( f"Unknown runtime { system . runtime } " )
435+
394436
395437
396438def run_single_evaluation (
397- system : SystemInfo ,
398439 call : list [str ],
399440 mode : str ,
400441 * ,
442+ system : SystemInfo ,
401443 multi_gpu : bool = False ,
402444 tests : Optional [str ] = None ,
403445 benchmarks : Optional [str ] = None ,
@@ -426,7 +468,7 @@ def run_single_evaluation(
426468
427469 cases .flush ()
428470
429- call += [mode , cases .name ]
471+ call = call + [mode , cases .name ]
430472
431473 if mode == "profile" :
432474 return profile_program (system , call , seed = seed , timeout = timeout , multi_gpu = multi_gpu )
@@ -498,7 +540,6 @@ def make_system_info() -> SystemInfo: # noqa: C901
498540
499541
500542def run_cuda_script ( # # noqa: C901
501- system : SystemInfo ,
502543 sources : dict [str , str ],
503544 headers : Optional [dict [str , str ]] = None ,
504545 arch : Optional [int ] = None ,
@@ -559,7 +600,7 @@ def run_cuda_script( # # noqa: C901
559600 if os .path .exists (f ):
560601 os .remove (f )
561602
562- run_result , profile_result = run_single_evaluation (system , ["./eval.out" ], ** kwargs )
603+ run_result , profile_result = run_single_evaluation (["./eval.out" ], ** kwargs )
563604 return EvalResult (
564605 start = start ,
565606 end = datetime .datetime .now (),
@@ -570,7 +611,6 @@ def run_cuda_script( # # noqa: C901
570611
571612
572613def run_pytorch_script ( # noqa: C901
573- system : SystemInfo ,
574614 sources : dict [str , str ],
575615 main : str ,
576616 ** kwargs ,
@@ -622,7 +662,7 @@ def run_pytorch_script( # noqa: C901
622662 exit_code = e .returncode ,
623663 )
624664
625- run , profile = run_single_evaluation (system , ["python3" , main ], ** kwargs )
665+ run , profile = run_single_evaluation (["python3" , main ], ** kwargs )
626666
627667 return EvalResult (
628668 start = start ,
0 commit comments