
Commit 620f676

Add Float8ActInt4WeightQATQuantizer
**Summary:** This commit adds a QAT quantizer that performs float8 dynamic activation + int4 symmetric per-channel weight fake quantization. Note that there is no corresponding config for float8 QAT yet; this will be added in a future PR.

**Test Plan:**
python test/quantization/test_qat.py -k test_float8_fake_quantize
python test/quantization/test_qat.py -k test_qat_fp8a4w_quantizer
1 parent f0f1f6c commit 620f676
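
For orientation, a minimal usage sketch of the new quantizer following the prepare flow added in this commit. The toy model, shapes, and optimizer settings below are illustrative, not part of the commit; only the `Float8ActInt4WeightQATQuantizer` API is taken from the diff.

```python
import torch
from torchao.quantization.qat import Float8ActInt4WeightQATQuantizer

# Any model containing nn.Linear layers; a toy stand-in here.
model = torch.nn.Sequential(
    torch.nn.Linear(64, 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 10),
)

# Swap nn.Linear -> FakeQuantizedLinear with float8 activation fake quantization
# (axiswise scaling by default) and int4 symmetric per-channel weight fake quantization.
quantizer = Float8ActInt4WeightQATQuantizer()
model = quantizer.prepare(model)

# Train / fine-tune as usual; fake quantization is applied in the forward pass.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
out = model(torch.randn(16, 64))
out.sum().backward()
optimizer.step()

# Note: quantizer.convert(model) is not implemented yet and raises NotImplementedError.
```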

5 files changed: 240 additions & 9 deletions


test/quantization/test_qat.py

Lines changed: 67 additions & 1 deletion
@@ -17,6 +17,9 @@
 from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa: F401
 
 from torchao import quantize_
+from torchao.float8.config import ScalingGranularity
+from torchao.float8.float8_scaling_utils import hp_tensor_to_float8_dynamic
+from torchao.float8.float8_tensor import LinearMMConfig
 from torchao.quantization.granularity import (
     PerAxis,
     PerGroup,
@@ -40,15 +43,18 @@
 )
 from torchao.quantization.qat.fake_quantizer import (
     FakeQuantizer,
+    _Float8ActivationFakeQuantizer,
 )
 from torchao.quantization.qat.linear import (
     FakeQuantizedLinear,
+    Float8ActInt4WeightQATQuantizer,
     Int4WeightOnlyQATLinear,
     Int8DynActInt4WeightQATLinear,
 )
 from torchao.quantization.qat.utils import (
     _fake_quantize_per_channel_group,
     _fake_quantize_per_token,
+    _Float8FakeQuantize,
     _GenericFakeQuantize,
     _get_qmin_qmax,
 )
@@ -69,6 +75,7 @@
 )
 from torchao.quantization.utils import (
     _get_per_token_block_size,
+    compute_error,
     get_group_qparams_symmetric,
     get_groupwise_affine_qparams,
     groupwise_affine_quantize_tensor,
@@ -1511,7 +1518,6 @@ def test_qat_8da4w_prepare_vs_convert(self, dtype: torch.dtype):
         numerics that match exactly over N trials.
         """
         from torchao.quantization.qat import Int8DynActInt4WeightQATQuantizer
-        from torchao.quantization.utils import compute_error
 
         num_trials = 1000
         group_size = 16
@@ -1711,6 +1717,66 @@ def test_qat_range_learning(self):
         loss.backward()
         optimizer.step()
 
+    @parameterized.expand([
+        (ScalingGranularity.TENSORWISE,),
+        (ScalingGranularity.AXISWISE,),
+    ])
+    def test_float8_fake_quantize(self, scaling_granularity: ScalingGranularity):
+        """
+        Test that `_Float8FakeQuantize` is numerically close to `Float8Tensor`.
+        """
+        torch.manual_seed(self.SEED)
+        dtype = torch.float8_e4m3fn
+        x = torch.randn(32, 64)
+        if scaling_granularity == ScalingGranularity.AXISWISE:
+            axiswise_dim = 0
+        else:
+            axiswise_dim = None
+        out = _Float8FakeQuantize.apply(x, dtype, scaling_granularity, axiswise_dim)
+        out_expected = hp_tensor_to_float8_dynamic(
+            x,
+            dtype,
+            LinearMMConfig(),
+            scaling_granularity=scaling_granularity,
+            axiswise_dim=axiswise_dim,
+        ).to_original_precision()
+        torch.testing.assert_close(out, out_expected, atol=0, rtol=0)
+
+    @unittest.skipIf(
+        not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower"
+    )
+    def test_qat_fp8a4w_quantizer(self):
+        """
+        Test basic model training with `Float8ActInt4WeightQATQuantizer`.
+        """
+        torch.manual_seed(self.SEED)
+        m = M()
+        qat_quantizer = Float8ActInt4WeightQATQuantizer()
+        qat_model = qat_quantizer.prepare(m)
+        for linear in [m.linear1, m.sub.linear, m.linear2]:
+            self.assertIsInstance(linear, FakeQuantizedLinear)
+            self.assertIsInstance(linear.activation_fake_quantizer, _Float8ActivationFakeQuantizer)
+            self.assertIsInstance(linear.weight_fake_quantizer, FakeQuantizer)
+        prev_weight = copy.deepcopy(m.linear1.weight)
+
+        # Simulate training
+        optimizer = torch.optim.SGD(
+            m.parameters(), lr=0.001, momentum=0.9, weight_decay=1e-5
+        )
+        loss_fn = torch.nn.CrossEntropyLoss()
+        optimizer.zero_grad()
+        target = torch.randn(1, 512).float()
+        example_inputs = m.example_inputs()
+        out = m(*example_inputs)
+        loss = loss_fn(out, target)
+        loss.backward()
+        optimizer.step()
+        # Assert that weights have valid gradients and are being updated
+        new_weight = m.linear1.weight
+        self.assertIsNotNone(new_weight.grad)
+        self.assertNotEqual(torch.count_nonzero(new_weight.grad), 0)
+        self.assertFalse(torch.equal(new_weight, prev_weight))
+
 
 if __name__ == "__main__":
     unittest.main()
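
For intuition about the two granularities parameterized in `test_float8_fake_quantize`: TENSORWISE derives a single scale from the global max-abs value, while AXISWISE derives one scale per slice along the reduced dimension (the activation fake quantizer added in this commit reduces over the last dim, i.e. one scale per row). The following is a rough standalone illustration of the quantize-dequantize round trip, not the torchao implementation; the helper name and epsilon are made up for the sketch.

```python
import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn

def fake_quantize_fp8_sketch(x: torch.Tensor, axiswise: bool) -> torch.Tensor:
    # Illustrative only: scale, saturate-cast to float8, then dequantize back.
    if axiswise:
        # One scale per row (reduce over the last dim).
        amax = x.abs().amax(dim=-1, keepdim=True)
    else:
        # A single scale for the whole tensor.
        amax = x.abs().amax()
    scale = FP8_MAX / amax.clamp(min=1e-12)
    x_fp8 = (x * scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
    return x_fp8.to(x.dtype) / scale

x = torch.randn(32, 64)
print(fake_quantize_fp8_sketch(x, axiswise=True).shape)  # torch.Size([32, 64])
```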

torchao/quantization/qat/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -11,13 +11,15 @@
     Int4WeightOnlyEmbeddingQATQuantizer,
 )
 from .linear import (
+    Float8ActInt4WeightQATQuantizer,
     Int4WeightOnlyQATQuantizer,
     Int8DynActInt4WeightQATQuantizer,
 )
 
 __all__ = [
     "ComposableQATQuantizer",
     "FakeQuantizeConfig",
+    "Float8ActInt4WeightQATQuantizer",
     "FromIntXQuantizationAwareTrainingConfig",
     "Int4WeightOnlyEmbeddingQATQuantizer",
     "Int4WeightOnlyQATQuantizer",

torchao/quantization/qat/fake_quantizer.py

Lines changed: 27 additions & 0 deletions
@@ -8,6 +8,8 @@
 
 import torch
 
+from torchao.float8.config import ScalingGranularity
+from torchao.float8.float8_scaling_utils import get_maybe_axiswise_dim
 from torchao.quantization.granularity import (
     PerAxis,
     PerGroup,
@@ -31,6 +33,7 @@
 from .utils import (
     _fake_quantize_per_channel_group,
     _fake_quantize_per_token,
+    _Float8FakeQuantize,
     _Round,
 )
 
@@ -186,3 +189,27 @@ def __repr__(self) -> str:
         Return a human readable representation of this `FakeQuantizer` with config details.
         """
         return "FakeQuantizer(%s)" % self.config
+
+
+class _Float8ActivationFakeQuantizer(torch.nn.Module):
+    """
+    Simple fake quantizer for float8 fake quantization, intended for activations only.
+    """
+
+    FLOAT8_DTYPE = torch.float8_e4m3fn
+
+    def __init__(self, scaling_granularity: ScalingGranularity):
+        super().__init__()
+        self.enabled = True
+        self.scaling_granularity = scaling_granularity
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.enabled:
+            return _Float8FakeQuantize.apply(
+                x,
+                self.FLOAT8_DTYPE,
+                self.scaling_granularity,
+                get_maybe_axiswise_dim(-1, self.scaling_granularity),
+            )
+        else:
+            return x
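
Based on the forward logic above, the new fake quantizer is an ordinary `nn.Module` whose output stays in the input's precision and that can be toggled via its `enabled` flag. A small usage sketch; the import paths match this diff, but the module is private API and the tensor shapes here are arbitrary.

```python
import torch
from torchao.float8.config import ScalingGranularity
from torchao.quantization.qat.fake_quantizer import _Float8ActivationFakeQuantizer

# Per-row (axiswise) float8 fake quantization of an activation tensor.
fq = _Float8ActivationFakeQuantizer(ScalingGranularity.AXISWISE)
x = torch.randn(8, 128)
y = fq(x)
assert y.shape == x.shape and y.dtype == x.dtype  # quantize-dequantize round trip

# When disabled, the module is a pass-through.
fq.enabled = False
x2 = torch.randn(8, 128)
assert torch.equal(fq(x2), x2)
```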

torchao/quantization/qat/linear.py

Lines changed: 110 additions & 7 deletions
@@ -10,6 +10,7 @@
 import torch.nn.functional as F
 
 from torchao.dtypes.utils import is_device
+from torchao.float8.config import ScalingGranularity
 from torchao.quantization.granularity import PerGroup
 from torchao.quantization.linear_quant_modules import (
     Int8DynActInt4WeightLinear,
@@ -28,7 +29,10 @@
 from torchao.utils import TORCH_VERSION_AT_LEAST_2_6
 
 from .api import FakeQuantizeConfig
-from .fake_quantizer import FakeQuantizer
+from .fake_quantizer import (
+    FakeQuantizer,
+    _Float8ActivationFakeQuantizer,
+)
 from .utils import (
     _get_qmin_qmax,
 )
@@ -145,6 +149,11 @@ def from_linear(
         return new_linear
 
 
+# ===========================
+# | QAT quantizer interface |
+# ===========================
+
+
 class _LegacyQATQuantizer(TwoStepQuantizer):
     """
     Base class for sharing common methods across legacy QAT quantizers.
@@ -157,9 +166,30 @@ def get_weight_fake_quantize_config(self) -> Optional[FakeQuantizeConfig]:
         return None
 
 
-# =========================================================
-# |   Linear int8 dynamic activations + int4 weight QAT   |
-# =========================================================
+def enable_linear_fake_quant(
+    mod: torch.nn.Module,
+    enabled: bool = True,
+):
+    """
+    Helper function to enable fake quantization in `FakeQuantizedLinear`.
+    """
+    if isinstance(mod, FakeQuantizedLinear):
+        if mod.activation_fake_quantizer is not None:
+            mod.activation_fake_quantizer.enabled = enabled
+        if mod.weight_fake_quantizer is not None:
+            mod.weight_fake_quantizer.enabled = enabled
+
+
+def disable_linear_fake_quant(mod: torch.nn.Module):
+    """
+    Helper function to disable fake quantization in `FakeQuantizedLinear`.
+    """
+    enable_linear_fake_quant(mod, enabled=False)
+
+
+# ===========================================
+# | int8 dynamic activations + int4 weights |
+# ===========================================
 
 
 class Int8DynActInt4WeightQATQuantizer(_LegacyQATQuantizer):
@@ -307,6 +337,7 @@ def disable_fake_quant(self):
         self.enable_fake_quant(False)
 
 
+# TODO: remove these in favor of enable_linear_fake_quant
 def enable_8da4w_fake_quant(mod: torch.nn.Module):
     """
     Enable fake quantization for `Int8DynActInt4WeightQATLinear`.
@@ -315,6 +346,7 @@ def enable_8da4w_fake_quant(mod: torch.nn.Module):
         mod.enable_fake_quant()
 
 
+# TODO: remove in favor of disable_linear_fake_quant
 def disable_8da4w_fake_quant(mod: torch.nn.Module):
     """
     Disable fake quantization for `Int8DynActInt4WeightQATLinear`.
@@ -357,9 +389,9 @@ def _get_8da4w_weight_config(
     )
 
 
-# ===================================
-# |   Linear int4 weight-only QAT   |
-# ===================================
+# ====================
+# | int4 weight-only |
+# ====================
 
 
 class Int4WeightOnlyQATQuantizer(_LegacyQATQuantizer):
@@ -501,6 +533,7 @@ def disable_fake_quant(self):
         self.enable_fake_quant(False)
 
 
+# TODO: remove these in favor of enable_linear_fake_quant
 def enable_4w_fake_quant(mod: torch.nn.Module):
     """
     Enable fake quantization for `Int4WeightOnlyQATLinear`.
@@ -509,6 +542,7 @@ def enable_4w_fake_quant(mod: torch.nn.Module):
         mod.enable_fake_quant()
 
 
+# TODO: remove these in favor of disable_linear_fake_quant
 def disable_4w_fake_quant(mod: torch.nn.Module):
     """
     Disable fake quantization for `Int4WeightOnlyQATLinear`.
@@ -533,3 +567,72 @@ def _get_4w_weight_config(
         zero_point_precision=qparams_precision,
         zero_point_domain=ZeroPointDomain.FLOAT,
     )
+
+
+# =====================================
+# | float8 activations + int4 weights |
+# =====================================
+
+
+class Float8ActInt4WeightQATQuantizer:
+    """
+    QAT quantizer for applying dynamic float8 activation + int4
+    per-channel symmetric weight fake quantization to linear
+    layers in the model.
+
+    Args:
+        activation_scaling_granularity (ScalingGranularity): float8 scaling granularity
+            for activation fake quantization, defaults to AXISWISE (per row)
+        scale_precision (torch.dtype): precision of weight scales, defaults to torch.bfloat16
+    """
+
+    def __init__(
+        self,
+        activation_scaling_granularity: ScalingGranularity = ScalingGranularity.AXISWISE,
+        scale_precision: torch.dtype = torch.bfloat16,
+    ):
+        # symmetric, so zero point precision does not matter
+        zero_point_precision = torch.float32
+        self._activation_scaling_granularity = activation_scaling_granularity
+        self._weight_config = FakeQuantizeConfig(
+            dtype=torch.int4,
+            granularity="per_channel",
+            is_symmetric=True,
+            is_dynamic=True,
+            scale_precision=scale_precision,
+            zero_point_precision=zero_point_precision,
+        )
+
+    def prepare(
+        self, model: torch.nn.Module, *args: Any, **kwargs: Any
+    ) -> torch.nn.Module:
+        """
+        Swap all `nn.Linear` children with `FakeQuantizedLinear`, using a float8
+        fake quantizer for activations and an int4 fake quantizer for weights.
+        """
+        for name, child in model.named_children():
+            if isinstance(child, torch.nn.Linear):
+                # TODO: add a config for float8?
+                new_linear = FakeQuantizedLinear.from_linear(
+                    child,
+                    weight_config=self._weight_config,
+                )
+                new_linear.activation_fake_quantizer = _Float8ActivationFakeQuantizer(
+                    self._activation_scaling_granularity
+                )
+                setattr(model, name, new_linear)
+            else:
+                self.prepare(child)
+        return model
+
+    # TODO: add convert path
+    def convert(
+        self, model: torch.nn.Module, *args: Any, **kwargs: Any
+    ) -> torch.nn.Module:
+        raise NotImplementedError
+
+    def get_activation_fake_quantize_config(self) -> Optional[FakeQuantizeConfig]:
+        raise NotImplementedError("Float8 FakeQuantizeConfig does not exist yet")
+
+    def get_weight_fake_quantize_config(self) -> Optional[FakeQuantizeConfig]:
+        return self._weight_config
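
The new `enable_linear_fake_quant` / `disable_linear_fake_quant` helpers operate on a single module, so, like the older per-quantizer helpers they are slated to replace, they would typically be broadcast over a prepared model with `Module.apply`. A small self-contained sketch under that assumption; the toy model and shapes are illustrative.

```python
import torch
from torchao.quantization.qat import Float8ActInt4WeightQATQuantizer
from torchao.quantization.qat.linear import (
    disable_linear_fake_quant,
    enable_linear_fake_quant,
)

model = torch.nn.Sequential(torch.nn.Linear(32, 32))
model = Float8ActInt4WeightQATQuantizer().prepare(model)
x = torch.randn(4, 32)

# Temporarily bypass fake quantization on every FakeQuantizedLinear,
# e.g. to compare against the full-precision behavior of the same modules.
model.apply(disable_linear_fake_quant)
baseline_out = model(x)

# Re-enable fake quantization before resuming QAT.
model.apply(enable_linear_fake_quant)
qat_out = model(x)
```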
