
Commit f400fef

Merge remote-tracking branch 'origin/main' into bench_structure
2 parents 0a2499c + 81f0bf2 commit f400fef

122 files changed: +5238, -4650 lines


.github/workflows/torchao_experimental_test.yml

Lines changed: 10 additions & 1 deletion

@@ -33,7 +33,10 @@ jobs:
       - name: Install requirements
         run: |
           conda activate venv
-          pip install torch --index-url "https://download.pytorch.org/whl/nightly/cpu"
+          # Install executorch first because it installs its own version
+          # of torch and torchao, which we do not want to use
+          pip install executorch
+          pip install torch --index-url "https://download.pytorch.org/whl/nightly/cpu" --force-reinstall
           pip install numpy
           pip install pytest
           pip install parameterized
@@ -57,6 +60,12 @@ jobs:
           sh build_and_run_tests.sh
           rm -rf /tmp/cmake-out
           popd
+      - name: ET ops build
+        run: |
+          conda activate venv
+          pushd torchao/experimental
+          sh build_torchao_ops.sh executorch
+          popd

   test-mps-ops:
     strategy:

README.md

Lines changed: 3 additions & 3 deletions

@@ -115,13 +115,13 @@ swap_linear_with_semi_sparse_linear(model, {"seq.0": SemiSparseLinear})
 ADAM takes 2x as much memory as the model params so we can quantize the optimizer state to either 8 or 4 bit effectively reducing the optimizer VRAM requirements by 2x or 4x respectively over an fp16 baseline

 ```python
-from torchao.prototype.low_bit_optim import AdamW8bit, AdamW4bit, AdamWFp8
+from torchao.optim import AdamW8bit, AdamW4bit, AdamWFp8
 optim = AdamW8bit(model.parameters()) # replace with Adam4bit and AdamFp8 for the 4 / fp8 versions
 ```

-In practice, we are a tiny bit slower than expertly written kernels but the implementations for these optimizers were written in a **few hundred lines of PyTorch code** and compiled so please use them or copy-paste them for your quantized optimizers. Benchmarks [here](https://github.com/pytorch/ao/tree/main/torchao/prototype/low_bit_optim)
+In practice, we are a tiny bit slower than expertly written kernels but the implementations for these optimizers were written in a **few hundred lines of PyTorch code** and compiled so please use them or copy-paste them for your quantized optimizers. Benchmarks [here](https://github.com/pytorch/ao/tree/main/torchao/optim)

-We also have support for [single GPU CPU offloading](https://github.com/pytorch/ao/tree/main/torchao/prototype/low_bit_optim#optimizer-cpu-offload) where both the gradients (same size as weights) and the optimizers will be efficiently sent to the CPU. This alone can **reduce your VRAM requirements by 60%**
+We also have support for [single GPU CPU offloading](https://github.com/pytorch/ao/tree/main/torchao/optim#optimizer-cpu-offload) where both the gradients (same size as weights) and the optimizers will be efficiently sent to the CPU. This alone can **reduce your VRAM requirements by 60%**

 ```python
 optim = CPUOffloadOptimizer(model.parameters(), torch.optim.AdamW, fused=True)
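
For orientation, the hunk above only changes the import path from `torchao.prototype.low_bit_optim` to `torchao.optim`; the API itself is unchanged. Below is a minimal sketch of the renamed API, mirroring the two snippets in the README diff; the toy model and hyperparameters are placeholders.

```python
import torch
from torchao.optim import AdamW8bit, CPUOffloadOptimizer

# Placeholder model; any nn.Module on CUDA works the same way.
model = torch.nn.Sequential(
    torch.nn.Linear(1024, 4096),
    torch.nn.ReLU(),
    torch.nn.Linear(4096, 1024),
).cuda()

# Option 1: keep the Adam state in 8 bits (same step()/zero_grad() API as torch.optim).
optimizer = AdamW8bit(model.parameters(), lr=1e-3)

# Option 2: offload optimizer state to CPU instead, as in the second README snippet.
# optimizer = CPUOffloadOptimizer(model.parameters(), torch.optim.AdamW, fused=True)
```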

benchmarks/benchmark_low_bit_adam.py

Lines changed: 6 additions & 8 deletions

@@ -34,7 +34,7 @@
 from torchvision.transforms import v2
 from tqdm import tqdm

-from torchao.prototype import low_bit_optim
+from torchao import optim
 from torchao.utils import get_available_devices

 _DEVICE = get_available_devices()[-1]
@@ -43,9 +43,9 @@
 OPTIM_MAP = dict(
     AdamW=partial(torch.optim.AdamW, fused=True),
     AdamW8bitBnb=bnb.optim.AdamW8bit,
-    AdamW8bitAo=low_bit_optim.AdamW8bit,
-    AdamWFp8Ao=low_bit_optim.AdamWFp8,
-    AdamW4bitAo=low_bit_optim.AdamW4bit,
+    AdamW8bitAo=optim.AdamW8bit,
+    AdamWFp8Ao=optim.AdamWFp8,
+    AdamW4bitAo=optim.AdamW4bit,
 )

 try:
@@ -249,12 +249,10 @@ def evaluate_model(model, args):
 optim_cls = OPTIM_MAP[args.optim]

 if args.optim_cpu_offload == "ao":
-    optim_cls = partial(
-        low_bit_optim.CPUOffloadOptimizer, optimizer_class=optim_cls
-    )
+    optim_cls = partial(optim.CPUOffloadOptimizer, optimizer_class=optim_cls)
 elif args.optim_cpu_offload == "ao_offload_grads":
     optim_cls = partial(
-        low_bit_optim.CPUOffloadOptimizer,
+        optim.CPUOffloadOptimizer,
         optimizer_class=optim_cls,
         offload_gradients=True,
     )
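
The hunk above keeps the existing `functools.partial` pattern: the offload wrapper and the inner optimizer class are bound first, and the parameters plus optimizer kwargs are supplied later. A minimal sketch of that pattern under the renamed `torchao.optim` module; the model and learning rate are placeholders.

```python
from functools import partial

import torch
from torchao import optim

model = torch.nn.Linear(1024, 1024).cuda()  # placeholder model

# Bind the wrapper and the inner optimizer class now, pass parameters and kwargs later,
# as the benchmark does for its "ao_offload_grads" option.
optim_cls = partial(
    optim.CPUOffloadOptimizer,
    optimizer_class=optim.AdamW8bit,
    offload_gradients=True,
)
optimizer = optim_cls(model.parameters(), lr=1e-3)
```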

benchmarks/benchmark_rowwise_scaled_linear_cutlass.py

Lines changed: 35 additions & 21 deletions

@@ -7,41 +7,55 @@
     rowwise_scaled_linear_cutlass_s4s4,
     rowwise_scaled_linear_cutlass_s8s4,
 )
+from torchao.quantization.quant_api import (
+    _int4_symm_cutlass_quant,
+    _int8_symm_cutlass_quant,
+)
+
+dtype = torch.bfloat16
+dtypeq = torch.int8
+dtype_scale = torch.float32
+device = torch.device("cuda")


 def benchmark_microseconds(f, *args):
     return do_bench(lambda: f(*args), return_mode="median") * 1e3


-def get_problem(m: int, n: int, k: int, A_nbits: int, B_nbits: int):
-    assert A_nbits in (4, 8) and B_nbits in (4, 8)
+def get_problem(m: int, n: int, k: int, Xq_nbits: int):
+    assert k % 2 == 0
+    assert Xq_nbits in [4, 8]
+
+    X_ref = torch.randn((m, k), dtype=dtype, device=device)
+    W_ref = torch.rand((n, k), dtype=dtype, device=device)

-    dev = torch.device("cuda")
-    A = torch.randint(-128, 127, (m, k * A_nbits // 8), dtype=torch.int8, device=dev)
-    A_scale = torch.randn((m,), dtype=torch.half, device=dev)
-    B = torch.randint(
-        -128, 127, size=(n, k * B_nbits // 8), dtype=torch.int8, device=dev
+    X_quant_func = (
+        _int4_symm_cutlass_quant if Xq_nbits == 4 else _int8_symm_cutlass_quant
     )
-    B_scale = torch.randn((n,), dtype=torch.half, device=dev)
-    C = None
+    W_quant_func = _int4_symm_cutlass_quant
+    X_aqt = X_quant_func(X_ref)
+    W_aqt = W_quant_func(W_ref)

-    return A, A_scale, B, B_scale, C
+    Xq = X_aqt.tensor_impl.int_data
+    X_scale = X_aqt.tensor_impl.scale
+    Wq = W_aqt.tensor_impl.int_data
+    W_scale = W_aqt.tensor_impl.scale
+    bias = None
+    out_dtype = dtype

+    return (X_ref, W_ref), (Xq, X_scale, Wq, W_scale, bias, out_dtype)

-def benchmark(m: int, k: int, n: int):
-    dev = torch.device("cuda")
-    A_ref = torch.randn((m, k), dtype=torch.half, device=dev)
-    B_ref = torch.randn((n, k), dtype=torch.half, device=dev)
-    fp16_time = benchmark_microseconds(torch.nn.functional.linear, A_ref, B_ref)

-    A, A_scale, B, B_scale, C = get_problem(m, n, k, 8, 4)
-    rowwise_scaled_linear_cutlass_s8s4_time = benchmark_microseconds(
-        rowwise_scaled_linear_cutlass_s8s4, A, A_scale, B, B_scale, C
+def benchmark(m: int, k: int, n: int):
+    ref_args, args = get_problem(m, n, k, 4)
+    fp16_time = benchmark_microseconds(torch.nn.functional.linear, *ref_args)
+    rowwise_scaled_linear_cutlass_s4s4_time = benchmark_microseconds(
+        rowwise_scaled_linear_cutlass_s4s4, *args
     )

-    A, A_scale, B, B_scale, C = get_problem(m, n, k, 4, 4)
-    rowwise_scaled_linear_cutlass_s4s4_time = benchmark_microseconds(
-        rowwise_scaled_linear_cutlass_s4s4, A, A_scale, B, B_scale, C
+    _, args = get_problem(m, n, k, 8)
+    rowwise_scaled_linear_cutlass_s8s4_time = benchmark_microseconds(
+        rowwise_scaled_linear_cutlass_s8s4, *args
     )

     return {

Lines changed: 72 additions & 0 deletions (new file)

@@ -0,0 +1,72 @@
+import pandas as pd
+import torch
+from tqdm import tqdm
+from triton.testing import do_bench
+
+from torchao.ops import rowwise_scaled_linear_sparse_cutlass_f8f8
+from torchao.quantization.quant_api import (
+    _float8_cutlass_quant,
+    _float8_cutlass_quant_sparse,
+)
+from torchao.sparsity.utils import create_semi_structured_tensor
+
+dtype = torch.bfloat16
+dtypeq_X = torch.float8_e5m2
+dtypeq_W = torch.float8_e4m3fn
+device = torch.device("cuda")
+
+
+def benchmark_microseconds(f, *args):
+    return do_bench(lambda: f(*args), return_mode="median") * 1e3
+
+
+def get_problem(m: int, n: int, k: int):
+    X_ref = torch.randn((m, k), dtype=dtype, device=device)
+    W_ref = create_semi_structured_tensor(n, k, dtype=dtype).to(device)
+
+    X_quant_func = _float8_cutlass_quant
+    W_quant_func = _float8_cutlass_quant_sparse
+    X_aqt = X_quant_func(X_ref, dtypeq_X)
+    W_aqt = W_quant_func(W_ref, dtypeq_W)
+
+    Xq = X_aqt.tensor_impl.float8_data
+    X_scale = X_aqt.tensor_impl.scale
+    Wq_sparse = W_aqt.tensor_impl.sparse
+    W_meta = W_aqt.tensor_impl.meta
+    W_scale = W_aqt.tensor_impl.scale
+    bias = None
+    out_dtype = dtype
+
+    return (X_ref, W_ref), (Xq, X_scale, Wq_sparse, W_meta, W_scale, bias, out_dtype)
+
+
+def benchmark(m: int, k: int, n: int):
+    ref_args, args = get_problem(m, n, k)
+    fp16_time = benchmark_microseconds(torch.nn.functional.linear, *ref_args)
+    rowwise_scaled_linear_sparse_cutlass_f8f8_time = benchmark_microseconds(
+        rowwise_scaled_linear_sparse_cutlass_f8f8, *args
+    )
+
+    return {
+        "m": m,
+        "k": k,
+        "n": n,
+        "fp16_latency (ms)": fp16_time,
+        "rowwise_scaled_linear_sparse_cutlass_f8f8 latency (ms)": rowwise_scaled_linear_sparse_cutlass_f8f8_time,
+        "f8f8 speedup (d/s)": fp16_time
+        / rowwise_scaled_linear_sparse_cutlass_f8f8_time,
+    }
+
+
+if __name__ == "__main__":
+    k_vals = (8192, 8192, 8192, 28672)
+    n_vals = (8192, 10240, 57344, 8192)
+
+    results = []
+    for m in tqdm([1 << i for i in range(10)]):
+        for n, k in zip(n_vals, k_vals):
+            results.append(benchmark(m, k, n))
+
+    df = pd.DataFrame(results)
+    df.to_csv("rowwise_scaled_linear_sparse_cutlass_time_results.csv", index=False)
+    print(df.to_markdown(index=False))

benchmarks/float8/float8_roofline.py

Lines changed: 4 additions & 1 deletion

@@ -184,8 +184,11 @@ def get_gemm_times(
     elif float8_recipe_name in ("rowwise", "rowwise_with_gw_hp"):
         scale_a = torch.ones(M, 1, device=device)
         scale_b = torch.ones(1, N, device=device)
+    elif mx_recipe_name == "mxfp8_cublas":
+        scale_a = torch.ones(M, K // 32, device=device, dtype=torch.float8_e8m0fnu)
+        scale_b = torch.ones(N, K // 32, device=device, dtype=torch.float8_e8m0fnu)
     else:
-        assert False, "TODO add mx gemm here"
+        assert False, "TODO add cutlass mx gemm here"

     def do_matmul(A, B):
         return torch._scaled_mm(

benchmarks/float8/training/README.md

Lines changed: 5 additions & 5 deletions

@@ -4,15 +4,15 @@ The `float8_training_benchmark.sh` script in this directory can be used to launc

 ## Usage

-Example: `TORCHTITAN_ROOT=${HOME}/torchtitan FLOAT8_RECIPE=rowwise ./float8_training_benchmark.sh`
+Example: `TORCHTITAN_ROOT=${HOME}/torchtitan FLOAT8_RECIPE_WITH_BEST_SETTINGS=rowwise ./float8_training_benchmark.sh`

 Training parameters can be configured via environment variables.

 - Required:
-  - `TORCHTITAN_ROOT`
+  - `TORCHTITAN_ROOT`: Root directory of torchtitan in your local filesystem
 - Optional:
-  - `RECIPE`: rowwise|tensorwise. defaults to tensorwise.
-  - `BATCH_SIZE`: defaults to 1.
-  - `STEPS`: defaults to 100.
+  - `FLOAT8_RECIPE_WITH_BEST_SETTINGS`: "rowwise" or "tensorwise". Applies float8 training with the specified scaling recipe, as well as additional training configs which are optimal for that scaling recipe. See `float8_training_benchmark.sh` for more details.
+  - `BATCH_SIZE`: Defaults to 1.
+  - `STEPS`: Defaults to 100.

 **NOTE**: `torch.compile` and FSDP2 are always used. Other forms of parallelism supported in torchtitan are not yet supported in this script.

benchmarks/quantized_training/pretrain_llama2.py

Lines changed: 6 additions & 7 deletions

@@ -22,14 +22,13 @@
 from torch.utils.checkpoint import checkpoint
 from tqdm import tqdm

-from torchao import quantize_
+from torchao import optim, quantize_
 from torchao._models.llama.model import (
     ModelArgs,
     RMSNorm,
     Transformer,
     transformer_configs,
 )
-from torchao.prototype import low_bit_optim
 from torchao.prototype.quantized_training import (
     bitnet_training,
     int8_mixed_precision_training,
@@ -190,10 +189,10 @@ def insert_rmsnorm(module: torch.nn.Module):
 print(f"No. of buffers: {sum(p.numel() for p in model.buffers()):,}")
 torch.cuda.reset_peak_memory_stats()  # don't count memory occupied by unquantized weights

-# only use optimizers from torchao.prototype.low_bit_optim to support quantized training
+# only use optimizers from torchao.optim to support quantized training
 if args.optim == "AdamW":
     args.optim = "_AdamW"
-optim = getattr(low_bit_optim, args.optim)(
+optimizer = getattr(optim, args.optim)(
     model.parameters(),
     lr=args.lr,
     weight_decay=args.weight_decay,
@@ -228,15 +227,15 @@ def insert_rmsnorm(module: torch.nn.Module):
 if step % args.log_interval == 0:
     log_dict = dict(
         loss=loss.item(),
-        lr=optim.param_groups[0]["lr"],
+        lr=optimizer.param_groups[0]["lr"],
         max_memory_allocated=torch.cuda.max_memory_allocated() / 1e9,
         max_memory_reserved=torch.cuda.max_memory_reserved() / 1e9,
     )
     run.log(log_dict, step=step)
     pbar.set_postfix(loss=log_dict["loss"])

-optim.step()
-optim.zero_grad()
+optimizer.step()
+optimizer.zero_grad()

 step += 1
 pbar.update()
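
Besides switching the import, this hunk renames the local variable from `optim` to `optimizer` so it no longer shadows the imported `torchao.optim` module. A minimal sketch of the resulting pattern; the tiny model and hyperparameters are placeholders, and the `"_AdamW"` remap follows the script above.

```python
import torch
from torchao import optim  # module name stays available for getattr lookups

model = torch.nn.Linear(256, 256).cuda()  # placeholder model

optim_name = "_AdamW"  # the script maps the user-facing "AdamW" choice to "_AdamW"
optimizer = getattr(optim, optim_name)(model.parameters(), lr=3e-4, weight_decay=0.1)

loss = model(torch.randn(8, 256, device="cuda")).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()
```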

docs/source/api_ref_dtypes.rst

Lines changed: 1 addition & 0 deletions

@@ -28,6 +28,7 @@ Layouts and Tensor Subclasses
     MarlinQQQLayout
     Int4CPULayout
     CutlassInt4PackedLayout
+    CutlassSemiSparseLayout

 Quantization techniques
 -----------------------
