
Commit 5f59cbb

Deprecate top level quantization APIs
Summary:

This PR deprecates a few quantization APIs; the BC-breaking notes are below.

1. int8 weight-only quantization

The module-swap API

```
apply_weight_only_int8_quant(model)
```

and the tensor subclass API

```
change_linear_weights_to_int8_woqtensors(model)
```

are replaced by the unified tensor subclass API

```
quantize(model, get_apply_int8wo_quant())
```

2. int8 dynamic quantization

```
apply_dynamic_quant(model)
```

or

```
change_linear_weights_to_int8_dqtensors(model)
```

are replaced by the unified tensor subclass API

```
quantize(model, get_apply_int8dyn_quant())
```

3. int4 weight-only quantization

```
change_linear_weights_to_int4_woqtensors(model)
```

is replaced by the unified tensor subclass API

```
quantize(model, get_apply_int4wo_quant())
```

Test Plan:

python test/quantization/test_quant_api.py
python test/integration/test_integration.py

Reviewers:

Subscribers:

Tasks:

Tags:
1 parent 950a893 commit 5f59cbb
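
For reference, here is a minimal before/after sketch of the migration this commit describes. It assumes a torchao build that includes this commit; the import path follows the updated test imports in the diffs below, and the toy `nn.Sequential` model is only illustrative.

```python
# Hedged sketch: assumes torchao at (or after) this commit exposes `quantize`
# and the `get_apply_*_quant` config constructors in quant_api.
import torch
import torch.nn as nn
from torchao.quantization.quant_api import quantize, get_apply_int8wo_quant

model = nn.Sequential(nn.Linear(1024, 1024)).eval()

# Before this commit (deprecated):
#   apply_weight_only_int8_quant(model)              # module-swap API
#   change_linear_weights_to_int8_woqtensors(model)  # tensor subclass API
# After this commit (unified tensor subclass API):
model = quantize(model, get_apply_int8wo_quant())

out = model(torch.randn(1, 1024))
```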

File tree

7 files changed: +240 -292 lines changed


test/integration/test_integration.py

Lines changed: 38 additions & 27 deletions
```diff
@@ -20,11 +20,10 @@
     DynamicallyPerAxisQuantizedLinear,
 )
 from torchao.quantization.quant_api import (
-    apply_dynamic_quant,
-    apply_weight_only_int8_quant,
-    change_linear_weights_to_int8_dqtensors,
-    change_linear_weights_to_int8_woqtensors,
-    change_linear_weights_to_int4_woqtensors,
+    get_apply_int4wo_quant,
+    get_apply_int8wo_quant,
+    get_apply_int8dyn_quant,
+    quantize,
     _replace_with_custom_fn_if_matches_filter,
 )
 from torchao.quantization.quant_primitives import (
@@ -73,7 +72,11 @@
 from parameterized import parameterized
 import itertools
 import logging
-from torchao.utils import TORCH_VERSION_AFTER_2_3, TORCH_VERSION_AFTER_2_4
+from torchao.utils import (
+    TORCH_VERSION_AFTER_2_3,
+    TORCH_VERSION_AFTER_2_4,
+    unwrap_tensor_subclass,
+)
 
 logger = logging.getLogger("INFO")
 
@@ -82,9 +85,9 @@
 
 # TODO: use this to reduce the number of tests
 TENSOR_SUBCLASS_APIS = [
-    change_linear_weights_to_int8_dqtensors,
-    change_linear_weights_to_int8_woqtensors,
-    change_linear_weights_to_int4_woqtensors,
+    get_apply_int4wo_quant,
+    get_apply_int8wo_quant,
+    get_apply_int8dyn_quant,
 ]
 
 COMMON_DEVICES = ["cpu", "cuda"]
@@ -736,7 +739,8 @@ def _test_lin_weight_subclass_api_impl(
             nn.Linear(k, n, device=test_device), nn.ReLU(), nn.Linear(n, n, device=test_device)
         ).to(test_dtype)
         ref_f = mod(x)
-        api(mod)
+        quantize(mod, api())
+        unwrap_tensor_subclass(mod)
 
         test = mod(x)
         self.assertGreater(
@@ -756,13 +760,13 @@ def _test_lin_weight_subclass_api_impl(
     @unittest.skipIf(TORCH_VERSION_AFTER_2_4, "skip because there is some bug in inductor codegen")
     def test_int8_dynamic_quant_subclass_api(self, device, dtype):
         self._test_lin_weight_subclass_api_impl(
-            change_linear_weights_to_int8_dqtensors, device, 35, test_dtype=dtype
+            get_apply_int8dyn_quant, device, 35, test_dtype=dtype
         )
 
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     def test_int8_weight_only_quant_subclass_api(self, device, dtype):
         self._test_lin_weight_subclass_api_impl(
-            change_linear_weights_to_int8_woqtensors, device, 40, test_dtype=dtype
+            get_apply_int8wo_quant, device, 40, test_dtype=dtype
         )
 
     @parameterized.expand(COMMON_DEVICE_DTYPE)
@@ -772,7 +776,7 @@ def test_int4_weight_only_quant_subclass_api(self, device, dtype):
             self.skipTest(f"Fails for {dtype}")
         for test_shape in ([(16, 1024, 16)] + ([(1, 1024, 256)] if device=='cuda' else [])):
             self._test_lin_weight_subclass_api_impl(
-                change_linear_weights_to_int4_woqtensors,
+                get_apply_int4wo_quant,
                 device,
                 15,
                 test_shape=test_shape,
@@ -789,7 +793,7 @@ def test_int4_weight_only_quant_subclass_api_grouped(self, device, dtype):
             for inner_k_tiles in [4, 2]:
                 kwargs = {"groupsize": groupsize, "inner_k_tiles": inner_k_tiles}
                 self._test_lin_weight_subclass_api_impl(
-                    lambda mod: change_linear_weights_to_int4_woqtensors(mod, **kwargs),
+                    lambda: get_apply_int4wo_quant(**kwargs),
                     device,
                     15,
                     test_shape=test_shape,
@@ -804,7 +808,7 @@ def test_dynamic_quant(self):
         m = nn.Sequential(nn.Linear(K, N))
 
         y_ref = m(x)
-        apply_dynamic_quant(m)
+        quantize(m, get_apply_int8dyn_quant())
         y_test = m(x)
 
         sqnr = compute_error(y_ref, y_test)
@@ -818,7 +822,7 @@ def test_weight_only_quant(self):
         x = torch.randn(*x_shape)
         m = nn.Sequential(nn.Linear(4, 5))
         y_ref = m(x)
-        apply_weight_only_int8_quant(m)
+        quantize(m, get_apply_int8wo_quant())
         y_wo = m(x)
         sqnr = compute_error(y_ref, y_wo)
         self.assertGreater(sqnr, 44.0)
@@ -841,7 +845,8 @@ def test_weight_only_quant_force_mixed_mm(self, device, dtype):
         x = torch.randn(*x_shape).to(device).to(dtype)
         m = nn.Sequential(nn.Linear(4, 5)).to(device).to(dtype)
         y_ref = m(x)
-        apply_weight_only_int8_quant(m)
+        m = quantize(m, get_apply_int8wo_quant())
+        m = unwrap_tensor_subclass(m)
         m(x)
         m_c = torch.compile(m, mode="max-autotune")
         y_wo, (code,) = run_and_get_code(m_c, x)
@@ -868,7 +873,8 @@ def test_weight_only_quant_use_mixed_mm(self, device, dtype):
         x = torch.randn(*x_shape).to(device).to(dtype)
         m = nn.Sequential(nn.Linear(4, 5)).to(device).to(dtype)
         y_ref = m(x)
-        apply_weight_only_int8_quant(m)
+        m = quantize(m, get_apply_int8wo_quant())
+        m = unwrap_tensor_subclass(m)
         m_c = torch.compile(m, mode="max-autotune")
         y_wo, (code,) = run_and_get_code(m_c, x)
         sqnr = compute_error(y_ref, y_wo)
@@ -908,7 +914,9 @@ def forward(self, x):
         ref_f = model(x)
 
         # save quantized state_dict
-        api(model)
+        quantize(model, api())
+        unwrap_tensor_subclass(model)
+
         torch.save(model.state_dict(), "test.pth")
         # get quantized reference
         model_qc = torch.compile(model, mode="max-autotune")
@@ -919,11 +927,13 @@ def forward(self, x):
         # load model structure
         with torch.device('meta'):
             model = test_model().to(dtype=test_dtype)
-        api(model)
+        quantize(model, api())
+        unwrap_tensor_subclass(model)
 
         # load quantized state_dict
         state_dict = torch.load("test.pth", mmap=True)
         os.remove("test.pth")
+
         model.load_state_dict(state_dict, assign=True)
         model = model.to(device=test_device, dtype=test_dtype).eval()
 
@@ -939,20 +949,20 @@ def forward(self, x):
     def test_save_load_dqtensors(self, device, dtype):
         if device == "cpu":
             self.skipTest(f"indcutor failed for cpu right now")
-        self._test_handle_save_load_meta_impl(change_linear_weights_to_int8_dqtensors, device, test_dtype=dtype)
+        self._test_handle_save_load_meta_impl(get_apply_int8dyn_quant, device, test_dtype=dtype)
 
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @torch.no_grad()
     def test_save_load_int8woqtensors(self, device, dtype):
-        self._test_handle_save_load_meta_impl(change_linear_weights_to_int8_woqtensors, device, test_dtype=dtype)
+        self._test_handle_save_load_meta_impl(get_apply_int8wo_quant, device, test_dtype=dtype)
 
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @unittest.skipIf(not TORCH_VERSION_AFTER_2_3, "int4 requires torch nightly.")
     @torch.no_grad()
     def test_save_load_int4woqtensors(self, device, dtype):
         if dtype != torch.bfloat16:
             self.skipTest(f"Fails for {dtype}")
-        self._test_handle_save_load_meta_impl(change_linear_weights_to_int4_woqtensors, device, 20, test_dtype=dtype)
+        self._test_handle_save_load_meta_impl(get_apply_int4wo_quant, device, 20, test_dtype=dtype)
 
 
 class TorchCompileUnitTest(unittest.TestCase):
@@ -1271,8 +1281,8 @@ def forward(self, x):
         model = test_model().to(dtype=test_dtype, device=test_device).eval()
         ref_f = model(x)
 
-        kwargs = {"dtype": test_dtype}
-        api(model, **kwargs)
+        # kwargs = {"dtype": test_dtype}
+        quantize(model, api())
 
         # running model
         model(x)
@@ -1317,8 +1327,9 @@ def forward(self, x):
         model = test_model().to(dtype=test_dtype, device=test_device).eval()
         ref_f = model(x)
 
-        kwargs = {"dtype": test_dtype}
-        api(model, **kwargs)
+        # kwargs = {"dtype": test_dtype}
+        model = quantize(model, api())
+        model = unwrap_tensor_subclass(model)
 
         # running model
         ref = model(x)
```
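
The updated integration tests above all follow the same flow: build a float reference, apply `quantize` with one of the `get_apply_*_quant()` configs, call `unwrap_tensor_subclass` before `torch.compile`, and compare against the reference. A rough standalone sketch of that flow, assuming this commit's APIs and import paths:

```python
# Hedged sketch of the test flow above; function names and import paths are
# taken from the diff and may change in later torchao releases.
import torch
import torch.nn as nn
from torchao.quantization.quant_api import quantize, get_apply_int8dyn_quant
from torchao.utils import unwrap_tensor_subclass

m = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 64)).eval()
x = torch.randn(2, 64)
y_ref = m(x)                                # float reference

m = quantize(m, get_apply_int8dyn_quant())  # int8 dynamic quant via tensor subclass
m = unwrap_tensor_subclass(m)               # required before torch.compile in these tests

m_c = torch.compile(m)
y_test = m_c(x)                             # compare y_test vs y_ref (e.g. SQNR) as the tests do
```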

test/quantization/test_quant_api.py

Lines changed: 17 additions & 25 deletions
```diff
@@ -35,8 +35,6 @@
 )
 from torchao.quantization.quant_api import (
     _replace_with_custom_fn_if_matches_filter,
-    apply_dynamic_quant,
-    apply_weight_only_int8_quant,
     Quantizer,
     TwoStepQuantizer,
     quantize,
@@ -53,6 +51,7 @@
 from torchao._models.llama.tokenizer import get_tokenizer
 from torchao._models.llama.model import Transformer, prepare_inputs_for_model
 import copy
+import tempfile
 
 
 def dynamic_quant(model, example_inputs):
@@ -62,20 +61,6 @@ def dynamic_quant(model, example_inputs):
     m = convert_pt2e(m)
     return m
 
-def _apply_dynamic_quant(model):
-    """
-    Applies dynamic symmetric per-token activation and per-channel weight
-    quantization to all linear layers in the given model using
-    module swaps.
-    """
-    _replace_with_custom_fn_if_matches_filter(
-        model,
-        lambda linear_mod: dynamic_quant(linear_mod, (torch.randn(1, linear_mod.in_features),)),
-        lambda mod, fqn: isinstance(mod, torch.nn.Linear),
-    )
-    return model
-
-
 def capture_and_prepare(model, example_inputs):
     m = torch.export.export(model, example_inputs)
     quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config(is_dynamic=True))
@@ -104,7 +89,7 @@ def convert(self, model: torch.nn.Module) -> torch.nn.Module:
 
 class TorchCompileDynamicQuantizer(Quantizer):
     def quantize(self, model: torch.nn.Module) -> torch.nn.Module:
-        apply_dynamic_quant(model)
+        quantize(model, get_apply_int8dyn_qunat())
         return model
 
 class ToyLinearModel(torch.nn.Module):
@@ -127,11 +112,13 @@ def _ref_change_linear_weights_to_int8_dqtensors(model, filter_fn=None, **kwargs
     The deprecated implementation for int8 dynamic quant API, used as a reference for
     numerics and performance
     """
-    from torchao.quantization.quant_api import _in_features_greater_than_16
     from torchao.quantization.quant_api import _is_linear
     from torchao.quantization.quant_api import _get_subclass_inserter
     from torchao.quantization.subclass import Int8DynamicallyQuantizedLinearWeight
 
+    def _in_features_greater_than_16(mod, *args):
+        return hasattr(mod, "in_features") and mod.in_features > 16
+
     if filter_fn is None:
         filter_fn = lambda *args: _is_linear(*args) and _in_features_greater_than_16(
             *args
@@ -167,7 +154,7 @@ class TestQuantFlow(unittest.TestCase):
     def test_dynamic_quant_gpu_singleline(self):
         m = ToyLinearModel().eval()
         example_inputs = m.example_inputs()
-        m = _apply_dynamic_quant(m)
+        m = quantize(m, get_apply_int8dyn_quant())
         quantized = m(*example_inputs)
         # AssertionError: Expecting input to have dtype torch.float32, but got dtype: torch.float64
         # While executing %choose_qparams_tensor_1 : [num_users=2] = call_function[target=torch.ops.quantized_decomposed.choose_qparams.tensor](args = (%arg0_3, -128, 127, 0.000244140625, torch.int8), kwargs = {})
@@ -205,16 +192,21 @@ def test_dynamic_quant_gpu_unified_api_eager_mode_impl(self):
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     def test_int8_wo_quant_save_load(self):
         m = ToyLinearModel().eval().cpu()
-        apply_weight_only_int8_quant(m)
+        m = quantize(m, get_apply_int8wo_quant())
+
+        from torchao.utils import unwrap_tensor_subclass
+        unwrap_tensor_subclass(m)
         example_inputs = m.example_inputs()
         ref = m(*example_inputs)
-        _TMP_FN = "_test.pt"
-        torch.save(m.state_dict(), _TMP_FN)
+        with tempfile.NamedTemporaryFile() as f:
+            torch.save(m.state_dict(), f)
+            f.seek(0)
+            state_dict = torch.load(f)
 
-        state_dict = torch.load(_TMP_FN)
-        os.remove(_TMP_FN)
         m2 = ToyLinearModel().eval()
-        apply_weight_only_int8_quant(m2)
+        m2 = quantize(m2, get_apply_int8wo_quant())
+        unwrap_tensor_subclass(m2)
+
         m2.load_state_dict(state_dict)
         m2 = m2.to(device="cuda")
         example_inputs = map(lambda x: x.cuda(), example_inputs)
```
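
The save/load test above also switches to `tempfile` instead of a hard-coded path. A rough sketch of the round trip it exercises, under the same assumptions about this commit's `quantize`, `get_apply_int8wo_quant`, and `unwrap_tensor_subclass`:

```python
# Hedged sketch mirroring test_int8_wo_quant_save_load above (CPU-only here;
# the real test moves the reloaded model to CUDA).
import tempfile
import torch
import torch.nn as nn
from torchao.quantization.quant_api import quantize, get_apply_int8wo_quant
from torchao.utils import unwrap_tensor_subclass

m = nn.Sequential(nn.Linear(32, 32)).eval()
m = quantize(m, get_apply_int8wo_quant())
unwrap_tensor_subclass(m)

with tempfile.NamedTemporaryFile() as f:
    torch.save(m.state_dict(), f)            # quantized weights serialized
    f.seek(0)
    state_dict = torch.load(f)

m2 = nn.Sequential(nn.Linear(32, 32)).eval()
m2 = quantize(m2, get_apply_int8wo_quant())  # rebuild the quantized structure
unwrap_tensor_subclass(m2)
m2.load_state_dict(state_dict)               # then restore the saved weights
```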

torchao/dtypes/aqt.py

Lines changed: 8 additions & 6 deletions
```diff
@@ -386,11 +386,13 @@ def __new__(
         quant_min: Optional[int] = None,
         quant_max: Optional[int] = None,
         zero_point_domain: ZeroPointDomain = ZeroPointDomain.INT,
+        device=None,
         dtype=None,
+        memory_format=None,
         strides=None,
     ):
         kwargs = {}
-        kwargs["device"] = layout_tensor.device
+        kwargs["device"] = layout_tensor.device if device is None else device
         kwargs["layout"] = (
             kwargs.get("layout") if kwargs.get("layout", False) else layout_tensor.layout
         )
@@ -500,7 +502,7 @@ def from_float(
         )
 
     @property
-    def layout(self) -> str:
+    def extended_layout(self) -> str:
         return self.layout_tensor.extended_layout
 
     @classmethod
@@ -596,8 +598,8 @@ def _quantized_linear_op(input_tensor, weight_qtensor, bias):
             is_cuda and
             input_is_int8 and
             input_tensor.dtype == weight_qtensor.dtype and
-            input_tensor.layout == "plain" and
-            weight_qtensor.layout == "plain"
+            input_tensor.extended_layout == "plain" and
+            weight_qtensor.extended_layout == "plain"
         ):
             #
             # 1. do the matrix form of dot(X_i, W_j)
@@ -639,7 +641,7 @@ def _quantized_linear_op(input_tensor, weight_qtensor, bias):
             weight_qtensor.dtype == torch.bfloat16 and
             len(weight_qtensor.shape) == 2 and
             weight_qtensor.zero_point_domain == ZeroPointDomain.FLOAT and
-            weight_qtensor.layout == "tensor_core_tiled"
+            weight_qtensor.extended_layout == "tensor_core_tiled"
         ):
             assert weight_qtensor.block_size[0] == 1, f"Requires groupwise quantization, got block_size: {block_size}"
             assert input_tensor.shape[-1] == weight_qtensor.shape[1], (
@@ -682,7 +684,7 @@ def _quantized_linear_op(input_tensor, weight_qtensor, bias):
             weight_qtensor.block_size[0] == 1 and
             weight_qtensor.block_size[1] == weight_qtensor.shape[1] and
             weight_qtensor.zero_point_domain == ZeroPointDomain.INT and
-            weight_qtensor.layout == "plain"
+            weight_qtensor.extended_layout == "plain"
         ):
             # TODO: enable cpu and mps efficient path
             # per channel int8 weight only quantizated mm
```
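
The `aqt.py` change renames the tensor subclass's string-valued `layout` property to `extended_layout` (and updates the dispatch checks accordingly), presumably because `layout` already means something else on `torch.Tensor`. A small illustrative sketch of that naming clash, not torchao code:

```python
# Illustrative only: shows why a string-valued `layout` property on a tensor
# subclass would shadow torch's built-in notion of layout (torch.strided, ...).
import torch

class Wrapped(torch.Tensor):
    @property
    def extended_layout(self) -> str:
        # separate, string-valued description of how the data is packed
        return "plain"

w = torch.randn(2, 2).as_subclass(Wrapped)
print(w.layout)           # torch.strided -- the built-in torch layout, untouched
print(w.extended_layout)  # "plain"       -- the packing format under its own name
```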
