@@ -1661,15 +1661,10 @@ def _float8_weight_only_transform(
         "applying int8 weight only quant requires module to have weight attribute"
         + " but {module} does not have one"
     )
-    # If model we're quantizing for inference was trained with torchao float8 training
-    # and checkpointed with the Float8Linears, we need to convert them back to
-    # regular nn.Linears so we can apply inference quantization techniques to them.
+
     if isinstance(module, Float8Linear):
-        with torch.device("meta"):
-            new_module = nn.Linear(module.in_features, module.out_features)
-            new_module.weight = module.weight
-            new_module.bias = module.bias
-            module = new_module
+        module = _unwrap_float8_linear(module)
+
     new_weight = _float8_weight_only_quant_tensor(module.weight, config)

     module.weight = torch.nn.Parameter(new_weight, requires_grad=False)
@@ -1879,6 +1874,9 @@ def _float8_dynamic_activation_float8_weight_transform(
         "applying float8 dynamic activation quant requires module to have weight attribute"
         + f"but {module} does not have one"
     )
+    if isinstance(module, Float8Linear):
+        module = _unwrap_float8_linear(module)
+
     quantized_weight = _float8_dynamic_activation_float8_weight_quantize_tensor(
         module.weight, config
     )
@@ -1914,6 +1912,9 @@ def _float8_dynamic_activation_float8_semi_sparse_weight_transform(
 ):
     assert is_sm_at_least_90(), "Float8 quantization is only supported on CUDA>=9.0"

+    if isinstance(module, Float8Linear):
+        module = _unwrap_float8_linear(module)
+
     weight = module.weight
     weight_dtype = config.weight_dtype
     activation_dtype = config.activation_dtype
@@ -1978,6 +1979,9 @@ def _float8_static_activation_float8_weight_transform(
         "Float8 static activation quantization is only supported on CUDA 8.9 and above"
     )

+    if isinstance(module, Float8Linear):
+        module = _unwrap_float8_linear(module)
+
     scale = config.scale
     activation_dtype = config.activation_dtype
     weight_dtype = config.weight_dtype
@@ -2337,6 +2341,9 @@ def _fpx_weight_only_transform(
     if config.set_inductor_config:
         torchao.quantization.utils.recommended_inductor_config_setter()

+    if isinstance(module, Float8Linear):
+        module = _unwrap_float8_linear(module)
+
     from torchao.dtypes import to_affine_quantized_fpx
     from torchao.dtypes.floatx import FloatxTensorCoreLayout
@@ -2395,6 +2402,21 @@ def _module_fqn_to_config_handler(
     return module


+def _unwrap_float8_linear(module: Float8Linear) -> nn.Linear:
+    """
+    Unwrap a torchao Float8Linear by returning a nn.Linear with the same weights and bias.
+
+    Torchao inference quantization techniques are generally only applicable to nn.Linear
+    layers, so this helper is useful for unwrapping models trained with torchao float8 training,
+    which replaces nn.Linear layers with Float8Linear layers.
+    """
+    with torch.device("meta"):
+        new_module = nn.Linear(module.in_features, module.out_features)
+    new_module.weight = module.weight
+    new_module.bias = module.bias
+    return new_module
+
+
 torch.serialization.add_safe_globals(
     [
         _int8_asymm_per_token_quant,
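
For context, below is a minimal standalone sketch of the pattern the new _unwrap_float8_linear helper uses, runnable without torchao installed. TrainedLinear and unwrap_to_linear are hypothetical stand-ins, not torchao APIs; the point is that constructing the replacement nn.Linear under torch.device("meta") avoids allocating fresh weight storage, since its parameters are immediately re-pointed at the trained module's tensors.

import torch
import torch.nn as nn


class TrainedLinear(nn.Linear):
    """Hypothetical stand-in for a training-time subclass such as Float8Linear."""


def unwrap_to_linear(module: nn.Linear) -> nn.Linear:
    # Build the plain nn.Linear on the meta device so no real weight/bias
    # storage is allocated; both parameters are re-pointed right below.
    with torch.device("meta"):
        new_module = nn.Linear(module.in_features, module.out_features)
    # Share (not copy) the already-trained parameters.
    new_module.weight = module.weight
    new_module.bias = module.bias
    return new_module


trained = TrainedLinear(16, 32)
plain = unwrap_to_linear(trained)
assert plain.weight is trained.weight  # parameters are shared, not copied
assert type(plain) is nn.Linear        # plain module, ready for inference quantization transforms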