@@ -2,66 +2,69 @@ export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder
2 2
3 3 # README BENCHMARKS
4 4 export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
5- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt
6- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt
7- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt
8- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt --precision float16
9- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int4wo-64 --write_result benchmark_results.txt
10- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant-int4 --write_result benchmark_results.txt
5+ python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization embed --compile
6+ python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile
7+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile
8+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt
9+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt
10+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt
11+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt --precision float16
12+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int4wo-64 --write_result benchmark_results.txt
13+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant-int4 --write_result benchmark_results.txt
11 14
12- export MODEL_REPO=meta-llama/Meta-Llama-3-8B
13- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt
14- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt
15- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt
16- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt --precision float16
17- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int4wo-64 --write_result benchmark_results.txt
18- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant-int4 --write_result benchmark_results.txt
15+ # export MODEL_REPO=meta-llama/Meta-Llama-3-8B
16+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt
17+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt
18+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt
19+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt --precision float16
20+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int4wo-64 --write_result benchmark_results.txt
21+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant-int4 --write_result benchmark_results.txt
19 22
20- export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B
21- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt
22- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt
23- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int4wo-64 --write_result benchmark_results.txt
24- # Runs on H100, float8 is not supported on CUDA arch < 8.9
25- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization float8wo --write_result benchmark_results.txt
26- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization float8dq-tensor --write_result benchmark_results.txt
27- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization float8dq-wo --write_result benchmark_results.txt
23+ # export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B
24+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt
25+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt
26+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int4wo-64 --write_result benchmark_results.txt
27+ # # Runs on H100, float8 is not supported on CUDA arch < 8.9
28+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization float8wo --write_result benchmark_results.txt
29+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization float8dq-tensor --write_result benchmark_results.txt
30+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization float8dq-wo --write_result benchmark_results.txt
28 31
29- # OTHER BENCHMARKS
32+ # # OTHER BENCHMARKS
30 33
31- # kv cache quantization
32- export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B
33- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 8192
34- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 8192 --kv_cache_quantization
35- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 8192 --kv_cache_quantization --linear_causal_mask
36- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 16384
37- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 16384 --kv_cache_quantization
38- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 16384 --kv_cache_quantization --linear_causal_mask
39- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 32768
40- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 32768 --kv_cache_quantization
41- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 32768 --kv_cache_quantization --linear_causal_mask
42- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 65536
43- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 65536 --kv_cache_quantization
44- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 65536 --kv_cache_quantization --linear_causal_mask
45- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 131072
46- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 131072 --kv_cache_quantization
47- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 131072 --kv_cache_quantization --linear_causal_mask
34+ # # kv cache quantization
35+ # export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B
36+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 8192
37+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 8192 --kv_cache_quantization
38+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 8192 --kv_cache_quantization --linear_causal_mask
39+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 16384
40+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 16384 --kv_cache_quantization
41+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 16384 --kv_cache_quantization --linear_causal_mask
42+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 32768
43+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 32768 --kv_cache_quantization
44+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 32768 --kv_cache_quantization --linear_causal_mask
45+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 65536
46+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 65536 --kv_cache_quantization
47+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 65536 --kv_cache_quantization --linear_causal_mask
48+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 131072
49+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 131072 --kv_cache_quantization
50+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 131072 --kv_cache_quantization --linear_causal_mask
48 51
49- export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
50- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision torch.float32 --write_result benchmark_results.txt
51- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt
52- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt
53- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant --write_result benchmark_results.txt
54- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt
55- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization sparse-marlin --precision float16 --write_result benchmark_results.txt
56- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-4-64 --write_result benchmark_results.txt
57- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-2-8 --write_result benchmark_results.txt
52+ # export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
53+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision torch.float32 --write_result benchmark_results.txt
54+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt
55+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt
56+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant --write_result benchmark_results.txt
57+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt
58+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization sparse-marlin --precision float16 --write_result benchmark_results.txt
59+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-4-64 --write_result benchmark_results.txt
60+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-2-8 --write_result benchmark_results.txt
58 61
59- export MODEL_REPO=meta-llama/Meta-Llama-3-8B
60- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision torch.float32 --write_result benchmark_results.txt
61- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt
62- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt
63- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant --write_result benchmark_results.txt
64- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt --precision float16
65- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization sparse-marlin --precision float16 --write_result benchmark_results.txt
66- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-4-64 --write_result benchmark_results.txt
67- python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-2-8 --write_result benchmark_results.txt
62+ # export MODEL_REPO=meta-llama/Meta-Llama-3-8B
63+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision torch.float32 --write_result benchmark_results.txt
64+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt
65+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt
66+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant --write_result benchmark_results.txt
67+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt --precision float16
68+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization sparse-marlin --precision float16 --write_result benchmark_results.txt
69+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-4-64 --write_result benchmark_results.txt
70+ # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-2-8 --write_result benchmark_results.txt
0 commit comments