Skip to content

Commit d71baa3

Browse files
committed
Add profile support
ghstack-source-id: df82466 ghstack-comment-id: 2752688926 Pull Request resolved: #1960
1 parent ae70fec commit d71baa3

File tree

3 files changed

+69
-4
lines changed

3 files changed

+69
-4
lines changed

benchmarks/microbenchmarks/benchmark_inference.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
BenchmarkResult,
2121
clean_caches,
2222
create_model_and_input,
23+
generate_model_profile,
2324
model_inference_time_in_ms,
2425
string_to_config,
2526
)
@@ -84,10 +85,9 @@ def run(config: BenchmarkConfig) -> BenchmarkResult:
8485
model=m_copy, input_data=input_data
8586
)
8687

87-
# TODO: Benchmark time using profiler
8888
# Profile dtype model evaluation
89-
# prof_dtype = benchmark_model_op_with_profiler_in_microseconds(m_copy, input_data, quantized_dtype)
90-
# prof_dtype.export_chrome_trace(f"{quantization}_model_{input_data[0].size()[0]}.json") # Save profiling details
89+
prof = generate_model_profile(m_copy, input_data)
90+
prof.export_chrome_trace(f"{config.profile_path}.json") # Save profiling details
9191

9292
# TODO: Benchmark gemm time using cuda graph
9393
# gemm_time = benchmark_torch_function_in_microseconds(gemm_op, *args, **kwargs)

benchmarks/microbenchmarks/test/benchmark_config.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ sparsity_config_recipe_names:
88
# Will run a baseline inference for model by default, without sparsity for comparison
99
- "semi-sparse"
1010
- "block"
11-
output_dir: "benchmarks/microbenchmarks/results"
11+
output_dir: "benchmarks/microbenchmarks/results/"
1212
model_params:
1313
- name: "small_bf16_linear"
1414
matrix_shapes:
@@ -21,6 +21,7 @@ model_params:
2121
torch_compile_mode: "max-autotune"
2222
device: "cuda"
2323
model_type: "linear"
24+
profile: True
2425

2526
- name: "large_bf16_ln_linear"
2627
matrix_shapes:

benchmarks/microbenchmarks/utils.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,12 @@
2727
UIntXWeightOnlyConfig,
2828
)
2929
from torchao.sparsity.sparse_api import BlockSparseWeightConfig, SemiSparseWeightConfig
30+
from torch.profiler import profile, record_function, ProfilerActivity
31+
import os
32+
import subprocess
33+
import sys
34+
import uuid
35+
3036

3137
try:
3238
import triton # noqa: F401
@@ -84,6 +90,8 @@ def __init__(
8490
"name",
8591
f"benchmark_{self.quantization}_{self.model_type}_m{self.m}_k{self.k}_n{self.n}{'_compile' if self.use_torch_compile else ''}",
8692
)
93+
self.profile = params.get("profile", False)
94+
self.profile_file_name = os.path.join(self.output_dir, f"/profile/{self.name}_{self.m}_{self.k}_{self.n}_profile.json")
8795

8896
@staticmethod
8997
def _parse_precision(precision_str: str) -> torch.dtype:
@@ -105,6 +113,7 @@ def to_dict(self) -> Dict[str, Any]:
105113
"device": self.device,
106114
"model_type": self.model_type,
107115
"output_dir": self.output_dir,
116+
"profile": self.profile,
108117
}
109118

110119

@@ -319,6 +328,61 @@ def model_inference_time_in_ms(model, input_data):
319328
return res * 1e6
320329

321330

331+
def upload_trace_file(local_path: str, overwrite: bool = False) -> Optional[str]:
    """Upload a local profiler trace to the shared Manifold trace bucket.

    Args:
        local_path: Path to the trace file on the local filesystem.
        overwrite: Currently unused; kept for interface compatibility.

    Returns:
        The Manifold path the file was uploaded to, or ``None`` when the
        upload command failed (e.g. the remote file already exists).
    """
    file_name = os.path.basename(local_path)
    # Prefix with user + uuid so repeated runs never collide remotely.
    manifold_path = os.path.join(
        "perfetto_internal_traces/tree/shared_trace",
        f"{os.getlogin()}_{uuid.uuid4()}_{file_name}",
    )
    cmd = [
        "manifold",
        "put",
        local_path,
        manifold_path,
        "--ttl",
        str(28 * 24 * 60 * 60),  # keep the trace for 28 days
        "--userData",
        "false",
    ]
    # capture_output=True, text=True is the modern spelling of
    # stdout=PIPE, stderr=PIPE, universal_newlines=True.
    ret = subprocess.run(cmd, capture_output=True, text=True)
    if ret.returncode == 0:
        print("Upload trace successfully.")
        return manifold_path
    print("[ERROR] Upload failed, maybe the trace file exists.")
    return None
355+
356+
357+
def print_perfetto_ui_url(manifold_path: str) -> None:
    """Print the Perfetto UI link for a trace stored at *manifold_path*."""
    viewer = "https://interncache-all.fbcdn.net/manifold/perfetto-artifacts/tree/ui/index.html"
    trace_url = f"{viewer}#!/?url=https://interncache-all.fbcdn.net/manifold/{manifold_path}"
    print(f"The trace is accessible at:\n{trace_url}")
364+
365+
366+
def generate_model_profile(model, input_data, profile_file):
367+
# Function to benchmark model evaluation with profiling
368+
torch.profiler._utils._init_for_cuda_graphs()
369+
prof = torch.profiler.profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True)
370+
with prof:
371+
# with record_function("model_inference"):
372+
for _ in range(1): # Run the model multiple times to warm up the cache
373+
with torch.no_grad():
374+
_ = model(*input_data)
375+
torch.cuda.synchronize()
376+
prof.export_chrome_trace(profile_file) # Save profiling details
377+
378+
manifold_path = upload_trace_file(profile_file)
379+
if manifold_path:
380+
print_perfetto_ui_url(manifold_path)
381+
382+
# Return the profiler output
383+
return prof
384+
385+
322386
def create_model_and_input(
323387
model_type: str,
324388
m: int,

0 commit comments

Comments
 (0)