"""Stretched intx weight-only quantization: a quantize_ config, its registered
module transform, and a helper that maps quantizer instances to torchao configs."""

from dataclasses import dataclass
from typing import Optional

import torch

from torchao.core.config import AOBaseConfig
from torchao.dtypes import Int4CPULayout
from torchao.quantization import MappingType, PerAxis, PerGroup
from torchao.quantization.quant_api import (
    Int4WeightOnlyConfig,
    Int8DynamicActivationIntxWeightConfig,
    IntxWeightOnlyConfig,
)
from torchao.quantization.quantize_.common.packing_format import PackingFormat
from torchao.quantization.quantize_.workflows import IntxUnpackedToInt8Tensor
from torchao.quantization.transform_module import register_quantize_module_handler
from torchao.utils import check_cpu_version

from .quant_api import (
    choose_qparams_stretched_affine,
    quantize_stretched_affine,
    to_stretched_affine_quantized_intx,
)
from .uniform_torchao import (
    _BIT_WIDTH_TO_DTYPE,
    Int4UnifTorchaoQuantizer,
    StretchedUnifTorchaoQuantizer,
)


@dataclass
class StretchedIntxWeightOnlyConfig(IntxWeightOnlyConfig):
    """IntxWeightOnlyConfig variant for "stretched" integer ranges.

    ``b`` is the weight bit width; ``quant_min``/``quant_max`` are the stretched
    range bounds (supplied by the quantizer rather than derived from ``b``).
    """

    b: Optional[int] = None
    quant_min: Optional[int] = None
    quant_max: Optional[int] = None
    activation_quantization: Optional[str] = "int8_asym_per_token"
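# Illustrative sketch only (not part of the original commit): a stretched 4-bit,
# group-size-32 weight-only config would typically be built with bounds taken
# from a quantizer, e.g.
#
#     q = StretchedUnifTorchaoQuantizer(4)
#     cfg = StretchedIntxWeightOnlyConfig(
#         b=4, quant_min=q.quant_min, quant_max=q.quant_max,
#         granularity=PerGroup(32),
#     )
#
# StretchedUnifTorchaoQuantizer is assumed here to take the bit width as its
# first argument.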

@register_quantize_module_handler(StretchedIntxWeightOnlyConfig)
def _stretched_intx_weight_only_transform(
    module: torch.nn.Module, config: StretchedIntxWeightOnlyConfig
) -> torch.nn.Module:
    """Replace ``module.weight`` in place with a stretched-affine quantized weight."""
    weight = module.weight
    granularity = config.granularity
    mapping_type = MappingType.ASYMMETRIC

    assert weight.dim() == 2, (
        f"StretchedIntxWeightOnlyConfig only works for 2-d Tensor, got: {weight.dim()}"
    )
    # Resolve the group size: PerGroup carries its own, while PerAxis(0)
    # treats each output row as a single group.
    if isinstance(granularity, PerGroup):
        group_size = granularity.group_size
    elif isinstance(granularity, PerAxis):
        assert granularity.axis == 0, (
            f"axis must be 0 with PerAxis, but got {granularity.axis}"
        )
        group_size = weight.shape[-1]
    else:
        raise ValueError(f"granularity must be PerGroup or PerAxis, got {granularity}")

    block_size = (1, group_size)
    target_dtype = torch.int8
    q_args = (weight, mapping_type, block_size, target_dtype, config.b)
    if config.version == 2:
        # Version 2 computes qparams and quantized data explicitly, then wraps
        # them in an IntxUnpackedToInt8Tensor subclass.
        scale, zero_point = choose_qparams_stretched_affine(
            *q_args,
            quant_min=config.quant_min,
            quant_max=config.quant_max,
        )
        qdata = quantize_stretched_affine(
            weight,
            block_size,
            scale,
            zero_point,
            target_dtype,
            quant_min=config.quant_min,
            quant_max=config.quant_max,
        )
        # Reshape the qparams to one entry per quantization block.
        n_blocks = [qdata.shape[i] // block_size[i] for i in range(len(block_size))]
        scale = scale.reshape(*n_blocks)
        zero_point = zero_point.reshape(*n_blocks)

        weight = IntxUnpackedToInt8Tensor(
            qdata=qdata,
            scale=scale,
            zero_point=zero_point,
            target_dtype=getattr(torch, f"int{config.b}"),
            block_size=block_size,
            dtype=weight.dtype,
            activation_quantization=config.activation_quantization,
        )
    else:
        # Pre-version-2 path: build the quantized weight with the
        # stretched-affine tensor constructor.
        weight = to_stretched_affine_quantized_intx(
            *q_args,
            quant_min=config.quant_min,
            quant_max=config.quant_max,
            scale_dtype=config.scale_dtype,
            _layout=config.layout,
        )
    module.weight = torch.nn.Parameter(weight, requires_grad=False)
    return module

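# Note (added for clarity): because the transform above is registered via
# register_quantize_module_handler, torchao.quantization.quantize_ routes any
# module configured with StretchedIntxWeightOnlyConfig through it; callers
# normally obtain that config from get_config_from_quantizer below.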
def get_config_from_quantizer(
    quantizer,
    is_embed: bool,
    device: torch.device,
    b: int,
    block_size: Optional[int],
    version: int = 2,
) -> AOBaseConfig:
    """Map a quantizer instance to the matching torchao quantization config."""
    # No block size means one group per output channel (axis 0).
    granularity = PerGroup(block_size) if block_size is not None else PerAxis(0)
    weight_dtype = _BIT_WIDTH_TO_DTYPE[b]
    if isinstance(quantizer, Int4UnifTorchaoQuantizer):
        # CPU builds need the int4 CPU layout; other devices use the default.
        kwargs = {"layout": Int4CPULayout()} if check_cpu_version(device) else {}
        config = Int4WeightOnlyConfig(group_size=block_size, **kwargs)
    elif isinstance(quantizer, StretchedUnifTorchaoQuantizer):
        config = StretchedIntxWeightOnlyConfig(
            b=b,
            quant_min=quantizer.quant_min,
            quant_max=quantizer.quant_max,
            granularity=granularity,
            version=version,
        )
    elif is_embed:
        # Embeddings get weight-only quantization (no activation quantization).
        config = IntxWeightOnlyConfig(
            weight_dtype=weight_dtype,
            granularity=granularity,
            mapping_type=quantizer.mapping_type,
            packing_format=PackingFormat.UNPACKED_TO_INT8,
            version=version,
        )
    else:
        config = Int8DynamicActivationIntxWeightConfig(
            weight_dtype=weight_dtype,
            weight_granularity=granularity,
            weight_mapping_type=quantizer.mapping_type,
            act_mapping_type=MappingType.ASYMMETRIC,
            packing_format=PackingFormat.UNPACKED_TO_INT8,
            version=version,
        )
    return config
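
if __name__ == "__main__":
    # Minimal usage sketch, not part of the original commit. Assumes
    # StretchedUnifTorchaoQuantizer takes the bit width as its first argument
    # and that torchao.quantization.quantize_ is available.
    from torchao.quantization import quantize_

    linear = torch.nn.Linear(64, 32)
    quantizer = StretchedUnifTorchaoQuantizer(4)
    config = get_config_from_quantizer(
        quantizer, is_embed=False, device=linear.weight.device, b=4, block_size=32
    )
    quantize_(linear, config)
    # The weight is now an IntxUnpackedToInt8Tensor wrapped in a Parameter.
    print(linear.weight)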