@@ -500,9 +500,9 @@ def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: l
     return 112


-def _run_single_profile(test: TestCase) -> str:
+def _run_single_profile_torch(test: TestCase) -> str:
     """
-    Runs a single test case. Do not call directly
+    Profiles a single benchmark using the torch profiler.
     """
     from submission import custom_kernel
     from torch.profiler import profile, ProfilerActivity
@@ -511,14 +511,42 @@ def _run_single_profile(test: TestCase) -> str:
     data = generate_input(**test.args)
     torch.cuda.synchronize()

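+    # Clone the inputs up front so the copy happens outside the profiled region.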
+    cloned = _clone_data(data, 0)
     with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
         with nvtx_range("custom_kernel"):
-            submission_output = custom_kernel(_clone_data(data, 0))
+            submission_output = custom_kernel(cloned)
             torch.cuda.synchronize()

     return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)


+def _run_single_profile_ncu(test: TestCase) -> str:
+    """
+    Profiles a single benchmark using NCU. Note: this does not
+    invoke NCU itself; instead, eval is expected to be launched
+    under NCU, and this function will run the kernel exactly
+    once in the 'custom_kernel' nvtx range.
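+
+    Example launch (the entry-point name and ncu flags here are
+    assumptions, not taken from this repo):
+        POPCORN_NCU=1 ncu --nvtx --nvtx-include "custom_kernel/" python eval.py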
529+ """
+    from submission import custom_kernel
+
+    with nvtx_range("generate input"):
+        data = generate_input(**test.args)
+        torch.cuda.synchronize()
+
+    cloned = _clone_data(data, 0)
+    with nvtx_range("custom_kernel"):
+        submission_output = custom_kernel(cloned)
+        torch.cuda.synchronize()
+
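+    # NCU, not this process, collects the profile, so there is no report to return.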
+    return ""
+
+
 def _run_distributed_profile(test: TestCase, rank: int) -> "EventList":
     """
     Runs a single profiling case. Do not call directly
@@ -610,7 +638,11 @@ def run_single_profile(test: TestCase, pool: multiprocessing.Pool) -> str:
610632 """
611633 world_size = test .args .get ("world_size" , None )
612634 if world_size is None :
613- return pool .apply (_run_single_profile , (test ,))
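+        # POPCORN_NCU opts in to the NCU path; the default "0" keeps the torch profiler.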
+        if os.getenv("POPCORN_NCU", "0") != "0":
+            return pool.apply(_run_single_profile_ncu, (test,))
+        else:
+            return pool.apply(_run_single_profile_torch, (test,))
     else:
         return run_multi_gpu_profile(pool, test, world_size)
