Update

vkuzo · vkuzo · commit 6f3d1278e334 · 2025-03-08T06:15:24.000-08:00
[ghstack-poisoned]
diff --git a/test/prototype/test_smoothquant.py b/test/prototype/test_smoothquant.py
@@ -5,11 +5,11 @@
 import torch
 
 from torchao.prototype.smoothquant import (
+    SmoothQuantConfig,
     SmoothQuantObservedLinear,
     insert_smooth_quant_observer_,
     load_smooth_quant_recipe,
     save_smooth_quant_recipe,
-    smooth_quant,
 )
 from torchao.quantization import quantize_
 from torchao.quantization.utils import (
@@ -85,7 +85,7 @@ def forward(self, x):
     m(data)
     # quantize
     is_observed_linear = lambda m, fqn: isinstance(m, SmoothQuantObservedLinear)
-    quantize_(m, smooth_quant(), is_observed_linear)
+    quantize_(m, SmoothQuantConfig(), is_observed_linear)
     with torch.inference_mode():
         if TORCH_VERSION_AT_LEAST_2_5:
             m = torch.compile(m, fullgraph=True)
@@ -173,7 +173,7 @@ def test_save_load_recipe(alpha, quant_mode, device, idtype):
 
     # quantize
     is_observed_linear = lambda m, fqn: isinstance(m, SmoothQuantObservedLinear)
-    quantize_(m, smooth_quant(), is_observed_linear)
+    quantize_(m, SmoothQuantConfig(), is_observed_linear)
     if TORCH_VERSION_AT_LEAST_2_5:
         # earlier versions are not compatible
         m = torch.compile(m, fullgraph=True)
diff --git a/torchao/prototype/smoothquant/README.md b/torchao/prototype/smoothquant/README.md
@@ -27,7 +27,7 @@ python example.py -m MODLE_ID --device=<cuda or cpu> --quant-mode=<dynamic or st
 ## Usage of API
 The following APIs are provided:
 - insert_smooth_quant_observer_
-- smooth_quant
+- SmoothQuantConfig
 - save_smooth_quant_recipe (advanced)
 - load_smooth_quant_recipe (advanced)
 
@@ -37,11 +37,11 @@ insert_smooth_quant_observer_(model, alpha=0.5, quant_mode="dynamic")
 ```
 After insertion, run the model for calibration on a certain dataset or (advanced) load a recipe.
 
-`smooth_quant` applies SmoothQuant to each linear layer of the model. Use it by calling `torchao.quantization.quantize_`. For example:
+`SmoothQuantConfig` configures applying SmoothQuant to each linear layer of the model. Use it by calling `torchao.quantization.quantize_`. For example:
 ```python
 from torchao.prototype.smoothquant import SmoothQuantObservedLinear
 is_observed_linear = lambda m, fqn: isinstance(m, SmoothQuantObservedLinear)
-torchao.quantization.quantize_(model, smooth_quant(), is_observed_linear)
+torchao.quantization.quantize_(model, SmoothQuantConfig(), is_observed_linear)
 ```
 `is_observed_linear` is a filter so that we only quantize observed linear layers.
 
diff --git a/torchao/prototype/smoothquant/__init__.py b/torchao/prototype/smoothquant/__init__.py
@@ -1,15 +1,15 @@
 from .api import (
+    SmoothQuantConfig,
     insert_smooth_quant_observer_,
     load_smooth_quant_recipe,
     save_smooth_quant_recipe,
-    smooth_quant,
 )
 from .core import SmoothQuantObservedLinear
 
 __all__ = [
     "insert_smooth_quant_observer_",
     "load_smooth_quant_recipe",
     "save_smooth_quant_recipe",
-    "smooth_quant",
+    "SmoothQuantConfig",
     "SmoothQuantObservedLinear",
 ]
diff --git a/torchao/prototype/smoothquant/api.py b/torchao/prototype/smoothquant/api.py
@@ -109,7 +109,7 @@ def recurse(module: torch.nn.Module, name: str = ""):
             wrapper = torch.nn.Sequential(module)
             quantize_(
                 wrapper,
-                smooth_quant(smoothing_factor, act_scales, wei_scales),
+                SmoothQuantConfig(smoothing_factor, act_scales, wei_scales),
                 is_observed_linear,
             )
             return wrapper[0]
@@ -165,10 +165,6 @@ class SmoothQuantConfig(AOBaseConfig):
     wei_scales: Optional[torch.Tensor] = None
 
 
-# for bc
-smooth_quant = SmoothQuantConfig
-
-
 @register_quantize_module_handler(SmoothQuantConfig)
 def _smooth_quant_transform(
     module: torch.nn.Module,
@@ -177,7 +173,6 @@ def _smooth_quant_transform(
     smoothing_factor = config.smoothing_factor
     act_scales = config.act_scales
     wei_scales = config.wei_scales
-    # weight = module.weight
     observed_linear = module
 
     linear = torch.nn.Linear(
@@ -187,11 +182,7 @@ def _smooth_quant_transform(
         device=observed_linear.weight.device,
         dtype=observed_linear.weight.dtype,
     )
-    # linear.weight = torch.nn.Parameter(
-    #     constructor(observed_linear), requires_grad=False
-    # )
     linear.bias = observed_linear.bias
-    # return linear
 
     target_dtype = torch.int8
     # act_scales is None for dynamic quantization thus not checked
diff --git a/torchao/prototype/smoothquant/example.py b/torchao/prototype/smoothquant/example.py
@@ -9,9 +9,9 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from torchao.prototype.smoothquant import (
+    SmoothQuantConfig,
     SmoothQuantObservedLinear,
     insert_smooth_quant_observer_,
-    smooth_quant,
 )
 from torchao.quantization import quantize_
 
@@ -145,7 +145,7 @@ def wikitext2_ppl(
         is_observed_linear = lambda m, fqn: isinstance(m, SmoothQuantObservedLinear)
         print(f"running SmoothQuant with {quant_mode} quantization")
         t0 = time.time()
-        quantize_(model, smooth_quant(), is_observed_linear)
+        quantize_(model, SmoothQuantConfig(), is_observed_linear)
         print(f"time for quantization: {time.time() - t0:.02f} seconds")
         if model_save_path is not None:
             print(f"Saving quantized model to {model_save_path}")