torchao/quantization/README.md (+19 −18)
@@ -5,24 +5,25 @@ Typically quantization algorithms will have different schemes for how the activa
Benchmarks are run on a machine with a single A100 GPU using the script in _models/llama, which generates text in a latency-optimized way (batchsize=1). Evaluation was done using lm_eval. The models used were meta-llama/Llama-2-7b-chat-hf and meta-llama/Meta-Llama-3-8B.
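As a rough illustration of the evaluation step, the sketch below scores one of those checkpoints through lm_eval's Python API. The task name (`wikitext`) and the exact lm_eval calls here are assumptions for illustration; the reported numbers come from the script in _models/llama, not this snippet.

```python
# Minimal sketch (assumed lm_eval API): score a checkpoint with lm_eval.
# A quantized model object could also be passed via `pretrained=`.
import lm_eval
from lm_eval.models.huggingface import HFLM

lm = HFLM(pretrained="meta-llama/Llama-2-7b-chat-hf", batch_size=1)
results = lm_eval.simple_evaluate(
    model=lm,
    tasks=["wikitext"],  # assumed task; not necessarily what the benchmark used
)
print(results["results"])
```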
note: Int8 dynamic quantization works best on compute-bound models like [SAM](https://github.com/pytorch-labs/segment-anything-fast), whereas Llama with batchsize=1 tends to be memory-bound, hence the relatively low performance here.
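For context, a minimal sketch of applying int8 dynamic quantization with torchao's `quantize_` API is shown below. API names follow recent torchao releases and may differ across versions; the example uses a toy module rather than the Llama benchmark setup.

```python
# Minimal sketch (assumed current torchao API): int8 dynamic-activation +
# int8-weight quantization applied in place to a toy module.
import torch
from torchao.quantization import quantize_, int8_dynamic_activation_int8_weight

model = torch.nn.Sequential(
    torch.nn.Linear(1024, 4096),
    torch.nn.ReLU(),
    torch.nn.Linear(4096, 1024),
).to(torch.bfloat16).cuda()

quantize_(model, int8_dynamic_activation_int8_weight())  # swaps Linear weights in place

x = torch.randn(1, 1024, dtype=torch.bfloat16, device="cuda")
with torch.no_grad():
    y = model(x)  # activations are dynamically quantized to int8 at runtime
```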