Commit 6b65cc7

Add convert path for quantize_ QAT API
Summary: #1415 added a quantize_ QAT API for the prepare path. This commit adds the remaining convert path so users can perform end-to-end QAT using the quantize_ API. The new flow will look like:

```
from torchao.quantization import (
    quantize_,
    int8_dynamic_activation_int4_weight,
)
from torchao.quantization.qat import (
    FakeQuantizeConfig,
    from_intx_quantization_aware_training,
    intx_quantization_aware_training,
)

activation_config = FakeQuantizeConfig(torch.int8, "per_token", is_symmetric=False)
weight_config = FakeQuantizeConfig(torch.int4, group_size=32)
quantize_(
    my_model,
    intx_quantization_aware_training(activation_config, weight_config),
)

quantize_(my_model, from_intx_quantization_aware_training())
quantize_(my_model, int8_dynamic_activation_int4_weight(group_size=32))
```

Test Plan:
python test/quantization/test_qat.py -k test_quantize_api_convert_path

ghstack-source-id: e6ea042
Pull Request resolved: #1540
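For additional context, here is a minimal sketch of where the prepare and convert calls sit relative to fine-tuning. The toy model, data, and single optimizer step are illustrative placeholders (not part of this commit), and the configs mirror the dtypes used in the summary above:

```
import torch

from torchao.quantization import int8_dynamic_activation_int4_weight, quantize_
from torchao.quantization.qat import (
    FakeQuantizeConfig,
    from_intx_quantization_aware_training,
    intx_quantization_aware_training,
)

# Illustrative model and data; bias=False keeps the convert path simple
model = torch.nn.Sequential(
    torch.nn.Linear(64, 64, bias=False),
    torch.nn.Linear(64, 16, bias=False),
)
example_input = torch.randn(8, 64)

# Prepare: swap nn.Linear for fake-quantized equivalents so training
# observes the quantization error
activation_config = FakeQuantizeConfig(torch.int8, "per_token", is_symmetric=False)
weight_config = FakeQuantizeConfig(torch.int4, group_size=32)
quantize_(model, intx_quantization_aware_training(activation_config, weight_config))

# Fine-tune as usual (illustrative single step)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
optimizer.zero_grad()
model(example_input).sum().backward()
optimizer.step()

# Convert: restore plain nn.Linear modules, then quantize them for real
quantize_(model, from_intx_quantization_aware_training())
quantize_(model, int8_dynamic_activation_int4_weight(group_size=32))
```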
1 parent b5b739b commit 6b65cc7

File tree

5 files changed (+134, -2 lines)

test/quantization/test_qat.py

Lines changed: 65 additions & 0 deletions

@@ -25,6 +25,7 @@
 from torchao.quantization.qat.api import (
     ComposableQATQuantizer,
     FakeQuantizeConfig,
+    from_intx_quantization_aware_training,
     intx_quantization_aware_training,
 )
 from torchao.quantization.qat.embedding import (
@@ -42,6 +43,9 @@
     _GenericFakeQuantize,
     _get_qmin_qmax,
 )
+from torchao.quantization.quant_api import (
+    int8_dynamic_activation_int4_weight,
+)
 from torchao.quantization.quant_primitives import (
     MappingType,
     TorchAODType,
@@ -1262,6 +1266,67 @@ def test_quantize_api_errors(self):
             lambda m, _: isinstance(m, torch.nn.ReLU),
         )
 
+    @unittest.skipIf(
+        not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower"
+    )
+    def test_quantize_api_convert_path(self):
+        """
+        Test that the following:
+
+            quantize_(model, intx_quantization_aware_training(...))
+            quantize_(model, from_intx_quantization_aware_training(...))
+            quantize_(model, int8_dynamic_activation_int4_weight())
+
+        can produce the same results as `Int8DynActInt4WeightQATQuantizer` prepare + convert.
+        """
+        from torchao.quantization.qat import (
+            Int8DynActInt4WeightQATQuantizer,
+        )
+
+        group_size = 16
+        torch.manual_seed(self.SEED)
+        m = M()
+        baseline_model = copy.deepcopy(m)
+
+        # Baseline prepare
+        baseline_quantizer = Int8DynActInt4WeightQATQuantizer(groupsize=group_size)
+        baseline_model = baseline_quantizer.prepare(baseline_model)
+
+        # quantize_ prepare
+        activation_config = FakeQuantizeConfig(
+            torch.int8,
+            "per_token",
+            is_symmetric=False,
+        )
+        weight_config = FakeQuantizeConfig(TorchAODType.INT4, group_size=group_size)
+        quantize_(
+            m,
+            intx_quantization_aware_training(activation_config, weight_config),
+        )
+
+        # Compare prepared values
+        torch.manual_seed(self.SEED)
+        x = m.example_inputs()
+        x2 = copy.deepcopy(x)
+        out = m(*x)
+        baseline_out = baseline_model(*x2)
+        torch.testing.assert_close(out, baseline_out, atol=0, rtol=0)
+
+        # Baseline convert
+        baseline_model = baseline_quantizer.convert(baseline_model)
+
+        # quantize_ convert
+        quantize_(m, from_intx_quantization_aware_training())
+        quantize_(m, int8_dynamic_activation_int4_weight(group_size=group_size))
+
+        # Compare converted values
+        torch.manual_seed(self.SEED)
+        x = m.example_inputs()
+        x2 = copy.deepcopy(x)
+        out = m(*x)
+        baseline_out = baseline_model(*x2)
+        torch.testing.assert_close(out, baseline_out, atol=0, rtol=0)
+
 
 if __name__ == "__main__":
     unittest.main()

torchao/quantization/qat/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -1,6 +1,7 @@
 from .api import (
     ComposableQATQuantizer,
     FakeQuantizeConfig,
+    from_intx_quantization_aware_training,
     intx_quantization_aware_training,
 )
 from .embedding import (
@@ -18,4 +19,5 @@
     "Int4WeightOnlyEmbeddingQATQuantizer",
     "Int8DynActInt4WeightQATQuantizer",
     "intx_quantization_aware_training",
+    "from_intx_quantization_aware_training",
 ]

torchao/quantization/qat/api.py

Lines changed: 38 additions & 2 deletions

@@ -5,7 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass
-from typing import Any, List, Optional, Union
+from typing import Any, Callable, List, Optional, Union
 
 import torch
 
@@ -242,7 +242,7 @@ def __setattr__(self, name: str, value: Any):
 def intx_quantization_aware_training(
     activation_config: Optional[FakeQuantizeConfig] = None,
     weight_config: Optional[FakeQuantizeConfig] = None,
-) -> torch.nn.Module:
+) -> Callable:
     """
     Return a function that applies fake quantization to a `torch.nn.Module`.
     to be used with :func:`~torchao.quantization.quant_api.quantize_`.
@@ -295,6 +295,42 @@ def _insert_fake_quantize(mod: torch.nn.Module):
     return _insert_fake_quantize
 
 
+def from_intx_quantization_aware_training() -> Callable:
+    """
+    Return a function that converts a model with fake quantized modules,
+    such as :func:`~torchao.quantization.qat.linear.FakeQuantizedLinear`
+    and :func:`~torchao.quantization.qat.linear.FakeQuantizedEmbedding`,
+    back to model with the original, corresponding modules without
+    fake quantization. This should be used with
+    :func:`~torchao.quantization.quant_api.quantize_`.
+
+    Example usage::
+
+        from torchao.quantization import quantize_
+        quantize_(
+            model_with_fake_quantized_linears,
+            from_intx_quantization_aware_training(),
+        )
+    """
+
+    def _remove_fake_quantize(mod: torch.nn.Module):
+        """
+        If the given module is a fake quantized module, return the original
+        corresponding version of the module without fake quantization.
+        """
+        from .embedding import FakeQuantizedEmbedding
+        from .linear import FakeQuantizedLinear
+
+        if isinstance(mod, FakeQuantizedLinear):
+            return mod.to_linear()
+        elif isinstance(mod, FakeQuantizedEmbedding):
+            return mod.to_embedding()
+        else:
+            return mod
+
+    return _remove_fake_quantize
+
+
 class ComposableQATQuantizer(TwoStepQuantizer):
     """
     Composable quantizer that users can use to apply multiple QAT quantizers easily.
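For illustration, a minimal round-trip sketch of the transform this function returns, as applied by `quantize_`. The toy model and configs are assumptions that mirror the docstring example and the commit summary:

```
import torch

from torchao.quantization import quantize_
from torchao.quantization.qat import (
    FakeQuantizeConfig,
    from_intx_quantization_aware_training,
    intx_quantization_aware_training,
)
from torchao.quantization.qat.linear import FakeQuantizedLinear

model = torch.nn.Sequential(torch.nn.Linear(32, 32, bias=False))

# Prepare: nn.Linear -> FakeQuantizedLinear
activation_config = FakeQuantizeConfig(torch.int8, "per_token", is_symmetric=False)
weight_config = FakeQuantizeConfig(torch.int4, group_size=16)
quantize_(model, intx_quantization_aware_training(activation_config, weight_config))
assert isinstance(model[0], FakeQuantizedLinear)

# Convert back: FakeQuantizedLinear -> nn.Linear, carrying the trained weight
quantize_(model, from_intx_quantization_aware_training())
assert not isinstance(model[0], FakeQuantizedLinear)
assert isinstance(model[0], torch.nn.Linear)
```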

torchao/quantization/qat/embedding.py

Lines changed: 18 additions & 0 deletions

@@ -82,6 +82,24 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             self.sparse,
         )
 
+    def to_embedding(self) -> torch.nn.Embedding:
+        new_embedding = torch.nn.Embedding(
+            self.num_embeddings,
+            self.embedding_dim,
+            self.padding_idx,
+            self.max_norm,
+            self.norm_type,
+            self.scale_grad_by_freq,
+            self.sparse,
+            device=self.weight.device,
+        )
+        # In distributed training, the model may be instantiated
+        # on the meta device, in which case there is no need to
+        # copy the weights, and doing so will result in an error
+        if self.weight.device != torch.device("meta"):
+            new_embedding.weight = self.weight
+        return new_embedding
+
     @classmethod
     def from_embedding(
         cls,
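A similar sketch for the embedding round trip, assuming the embedding prepare path from #1415. Since `quantize_` targets `nn.Linear` by default, an explicit `filter_fn` is passed, and only a weight config is used because embedding activations are not fake-quantized:

```
import torch

from torchao.quantization import quantize_
from torchao.quantization.qat import (
    FakeQuantizeConfig,
    from_intx_quantization_aware_training,
    intx_quantization_aware_training,
)
from torchao.quantization.qat.embedding import FakeQuantizedEmbedding

model = torch.nn.Sequential(torch.nn.Embedding(128, 64))
embedding_filter = lambda m, _: isinstance(m, torch.nn.Embedding)

# Prepare: nn.Embedding -> FakeQuantizedEmbedding (weight-only fake quantization)
weight_config = FakeQuantizeConfig(torch.int4, group_size=32)
quantize_(
    model,
    intx_quantization_aware_training(weight_config=weight_config),
    filter_fn=embedding_filter,
)
assert isinstance(model[0], FakeQuantizedEmbedding)

# Convert back: to_embedding() rebuilds a plain nn.Embedding that reuses the weight
quantize_(model, from_intx_quantization_aware_training(), filter_fn=embedding_filter)
assert type(model[0]) is torch.nn.Embedding
assert model[0].weight.shape == (128, 64)
```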

torchao/quantization/qat/linear.py

Lines changed: 11 additions & 0 deletions

@@ -105,6 +105,17 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         w = self.weight
         return F.linear(x, w)
 
+    def to_linear(self) -> torch.nn.Linear:
+        new_linear = torch.nn.Linear(
+            self.in_features, self.out_features, self.bias, device=self.weight.device
+        )
+        # In distributed training, the model may be instantiated
+        # on the meta device, in which case there is no need to
+        # copy the weights, and doing so will result in an error
+        if self.weight.device != torch.device("meta"):
+            new_linear.weight = self.weight
+        return new_linear
+
     @classmethod
     def from_linear(
         cls,
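One detail worth noting: `to_linear()` (like `to_embedding()`) reassigns the existing weight parameter rather than copying it, so the returned module shares storage with the fake-quantized one. A small sketch checking this, with the `FakeQuantizedLinear` built via the prepare path since its constructor is not shown in this diff:

```
import torch

from torchao.quantization import quantize_
from torchao.quantization.qat import FakeQuantizeConfig, intx_quantization_aware_training
from torchao.quantization.qat.linear import FakeQuantizedLinear

model = torch.nn.Sequential(torch.nn.Linear(64, 64, bias=False))
weight_config = FakeQuantizeConfig(torch.int4, group_size=32)
quantize_(model, intx_quantization_aware_training(weight_config=weight_config))

fq_linear = model[0]
assert isinstance(fq_linear, FakeQuantizedLinear)

plain = fq_linear.to_linear()
# Same Parameter object, not a clone, because to_linear() assigns self.weight directly
assert plain.weight is fq_linear.weight
```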
