Skip to content

Commit d71baa3

Browse files
committed
Add profile support
ghstack-source-id: df82466 ghstack-comment-id: 2752688926 Pull Request resolved: #1960
1 parent ae70fec commit d71baa3

File tree

3 files changed

+69
-4
lines changed

3 files changed

+69
-4
lines changed

benchmarks/microbenchmarks/benchmark_inference.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
BenchmarkResult,
2121
clean_caches,
2222
create_model_and_input,
23+
generate_model_profile,
2324
model_inference_time_in_ms,
2425
string_to_config,
2526
)
@@ -84,10 +85,9 @@ def run(config: BenchmarkConfig) -> BenchmarkResult:
8485
model=m_copy, input_data=input_data
8586
)
8687

87-
# TODO: Benchmark time using profiler
8888
# Profile dtype model evaluation
89-
# prof_dtype = benchmark_model_op_with_profiler_in_microseconds(m_copy, input_data, quantized_dtype)
90-
# prof_dtype.export_chrome_trace(f"{quantization}_model_{input_data[0].size()[0]}.json") # Save profiling details
89+
prof = generate_model_profile(m_copy, input_data)
90+
prof.export_chrome_trace(f"{config.profile_path}.json") # Save profiling details
9191

9292
# TODO: Benchmark gemm time using cuda graph
9393
# gemm_time = benchmark_torch_function_in_microseconds(gemm_op, *args, **kwargs)

benchmarks/microbenchmarks/test/benchmark_config.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ sparsity_config_recipe_names:
88
# Will run a baseline inference for model by default, without sparsity for comparison
99
- "semi-sparse"
1010
- "block"
11-
output_dir: "benchmarks/microbenchmarks/results"
11+
output_dir: "benchmarks/microbenchmarks/results/"
1212
model_params:
1313
- name: "small_bf16_linear"
1414
matrix_shapes:
@@ -21,6 +21,7 @@ model_params:
2121
torch_compile_mode: "max-autotune"
2222
device: "cuda"
2323
model_type: "linear"
24+
profile: True
2425

2526
- name: "large_bf16_ln_linear"
2627
matrix_shapes:

benchmarks/microbenchmarks/utils.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,12 @@
2727
UIntXWeightOnlyConfig,
2828
)
2929
from torchao.sparsity.sparse_api import BlockSparseWeightConfig, SemiSparseWeightConfig
30+
from torch.profiler import profile, record_function, ProfilerActivity
31+
import os
32+
import subprocess
33+
import sys
34+
import uuid
35+
3036

3137
try:
3238
import triton # noqa: F401
@@ -84,6 +90,8 @@ def __init__(
8490
"name",
8591
f"benchmark_{self.quantization}_{self.model_type}_m{self.m}_k{self.k}_n{self.n}{'_compile' if self.use_torch_compile else ''}",
8692
)
93+
self.profile = params.get("profile", False)
94+
self.profile_file_name = os.path.join(self.output_dir, f"/profile/{self.name}_{self.m}_{self.k}_{self.n}_profile.json")
8795

8896
@staticmethod
8997
def _parse_precision(precision_str: str) -> torch.dtype:
@@ -105,6 +113,7 @@ def to_dict(self) -> Dict[str, Any]:
105113
"device": self.device,
106114
"model_type": self.model_type,
107115
"output_dir": self.output_dir,
116+
"profile": self.profile,
108117
}
109118

110119

@@ -319,6 +328,61 @@ def model_inference_time_in_ms(model, input_data):
319328
return res * 1e6
320329

321330

331+
def upload_trace_file(local_path: str, overwrite: bool = False) -> Optional[str]:
    """Upload a local profiler trace to the shared Manifold trace bucket.

    Args:
        local_path: Path to the trace file on the local filesystem.
        overwrite: Currently unused; kept for interface compatibility.

    Returns:
        The Manifold path the file was uploaded to, or ``None`` when the
        upload command failed (e.g. the remote file already exists).
    """
    file_name = os.path.basename(local_path)
    # Prefix with user + uuid so repeated runs never collide remotely.
    manifold_path = os.path.join(
        "perfetto_internal_traces/tree/shared_trace",
        f"{os.getlogin()}_{uuid.uuid4()}_{file_name}",
    )
    cmd = [
        "manifold",
        "put",
        local_path,
        manifold_path,
        "--ttl",
        str(28 * 24 * 60 * 60),  # keep the trace for 28 days
        "--userData",
        "false",
    ]
    # capture_output=True, text=True is the modern spelling of
    # stdout=PIPE, stderr=PIPE, universal_newlines=True.
    ret = subprocess.run(cmd, capture_output=True, text=True)
    if ret.returncode == 0:
        print("Upload trace successfully.")
        return manifold_path
    print("[ERROR] Upload failed, maybe the trace file exists.")
    return None
355+
356+
357+
def print_perfetto_ui_url(manifold_path: str) -> None:
    """Print the Perfetto UI link for a trace stored at *manifold_path*."""
    viewer = "https://interncache-all.fbcdn.net/manifold/perfetto-artifacts/tree/ui/index.html"
    trace_url = f"{viewer}#!/?url=https://interncache-all.fbcdn.net/manifold/{manifold_path}"
    print(f"The trace is accessible at:\n{trace_url}")
364+
365+
366+
def generate_model_profile(model, input_data, profile_file):
367+
# Function to benchmark model evaluation with profiling
368+
torch.profiler._utils._init_for_cuda_graphs()
369+
prof = torch.profiler.profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True)
370+
with prof:
371+
# with record_function("model_inference"):
372+
for _ in range(1): # Run the model multiple times to warm up the cache
373+
with torch.no_grad():
374+
_ = model(*input_data)
375+
torch.cuda.synchronize()
376+
prof.export_chrome_trace(profile_file) # Save profiling details
377+
378+
manifold_path = upload_trace_file(profile_file)
379+
if manifold_path:
380+
print_perfetto_ui_url(manifold_path)
381+
382+
# Return the profiler output
383+
return prof
384+
385+
322386
def create_model_and_input(
323387
model_type: str,
324388
m: int,

0 commit comments

Comments
 (0)