
Commit c4bb9e7

update readme. small fixes
1 parent cbc3f05 commit c4bb9e7

File tree

3 files changed: +15 -11 lines changed

benchmarks/benchmark_low_bit_adam.py

Lines changed: 4 additions & 1 deletion

@@ -11,6 +11,7 @@
 # To enable cosine learning rate scheduler, set --cosine_lr_scheduler
 
 import argparse
+import datetime
 import math
 from contextlib import nullcontext
 from functools import partial
@@ -175,6 +176,7 @@ def evaluate_model(model, args):
 
 grad_scaler = torch.amp.GradScaler("cuda", enabled=args.amp == "fp16")
 
+start_time = datetime.datetime.now()
 step = 0
 for epoch_idx in range(args.n_epochs):
     model.train()
@@ -214,4 +216,5 @@ def evaluate_model(model, args):
     print(f"Epoch {epoch_idx + 1}/{args.n_epochs}: val_acc={val_acc.item() * 100:.2f}")
     logger.log(dict(val_acc=val_acc), step=step)
 
-print(f"Max memory allocated: {torch.cuda.max_memory_allocated() / (1 << 30):.2f} GB")
+print(f"Time taken: {(datetime.datetime.now() - start_time)}")
+print(f"Max used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB")

torchao/prototype/low_bit_optim/README.md

Lines changed: 9 additions & 4 deletions

@@ -31,12 +31,17 @@ NOTE:
 
 Benchmark script for fine-tuning a [timm](https://github.com/huggingface/pytorch-image-models) model on [resisc45](https://huggingface.co/datasets/timm/resisc45) dataset is available at [benchmarks/benchmark_low_bit_adam.py](../../../benchmarks/benchmark_low_bit_adam.py).
 
-Results for fine-tuning ViT-B with BF16 AMP, on 4070Ti SUPER:
+Results for fine-tuning ViT-H (630M params) with BF16 AMP, batch size 4, 1 epoch, on 4070Ti SUPER:
 
-TODO: update this table
+Adam impl  | max memory (GB) | time taken | accuracy
+-----------|-----------------|------------|----------
+PyTorch    | 12.98           | 10m 08s    | 87.70
+bnb 8-bit  | 8.31            | 8m 38s     | 86.22
+ao 8-bit   | 8.32            | 10m 54s    | 86.67
+lpmm 4-bit | 7.72            | 7m 48s     | 84.70
+ao 4-bit   | 7.72            | 9m 17s     | 85.60
 
-Adam impl | max memory (GB) | training time | accuracy
-----------|-----------------|---------------|----------
+NOTE: time taken includes validation time, and compile time for torchao optimizers.
 
 ## Credits
 
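
As a rough usage sketch to go with the table above (hedged: it assumes the Adam8bit class exported by torchao.prototype.low_bit_optim and uses a toy model in place of the timm ViT from the benchmark), swapping in a low-bit optimizer is intended to look like a drop-in replacement for torch.optim.Adam:

import torch
from torch import nn
from torchao.prototype.low_bit_optim import Adam8bit  # assumed export of this prototype module

# toy stand-in for the timm ViT used in the benchmark; sizes chosen so parameter
# tensors divide evenly into typical quantization block sizes
model = nn.Linear(256, 256).cuda()
optim = Adam8bit(model.parameters(), lr=1e-4)  # used like torch.optim.Adam, but with 8-bit state

for _ in range(10):
    x = torch.randn(4, 256, device="cuda")
    y = torch.randint(0, 256, (4,), device="cuda")
    loss = nn.functional.cross_entropy(model(x), y)
    loss.backward()
    optim.step()
    optim.zero_grad()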

torchao/prototype/low_bit_optim/subclass_4bit.py

Lines changed: 2 additions & 6 deletions

@@ -49,9 +49,7 @@ def quantize_4bit_with_qmap(input: Tensor, qmap: Tensor, block_size: int, implem
         raise ValueError(f"Unsupported implementation={implementation}")
 
     # packing
-    codes1, codes2 = codes.chunk(2, 0)
-    codes = (codes1 << 4) | codes2
-
+    codes = (codes[::2] << 4) | codes[1::2]
     return codes, scale
 
 
@@ -90,9 +88,7 @@ def __tensor_unflatten__(cls, tensor_data_dict, tensor_attributes, outer_size=No
 
     def dequantize(self, output_dtype=None):
         # unpack
-        codes1 = self.codes >> 4
-        codes2 = self.codes & 0b1111
-        codes = torch.cat([codes1, codes2], 0)
+        codes = torch.stack([self.codes >> 4, self.codes & 0b1111], dim=-1)
 
         # torch.compile() cannot use uint8 as index
         float_data = self.qmap[codes.int()]
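
A small round-trip sketch of the interleaved 4-bit packing used above; this is just the bit manipulation on a toy uint8 tensor, not the tensor-subclass API.

import torch

# 8 values in [0, 16) stand in for the quantized 4-bit codes
codes = torch.randint(0, 16, (8,), dtype=torch.uint8)

# pack: even-indexed codes go to the high nibble, odd-indexed codes to the low nibble
packed = (codes[::2] << 4) | codes[1::2]  # shape (4,), still uint8

# unpack: pull both nibbles back out and re-interleave them with stack + flatten
unpacked = torch.stack([packed >> 4, packed & 0b1111], dim=-1).flatten()

assert torch.equal(unpacked, codes)

Compared with the previous chunk-based layout, each byte now holds two adjacent codes, so packing and unpacking no longer need to split or concatenate halves of the tensor along dim 0.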
