Commit 2019b80

Fix the impl of `to` for the int4 weight-only use case
Summary:

Note that we can't do the following right now:
* initialize and quantize the model with int4_weight_only quant on CPU
* move the model to CUDA

We'll enable this in a separate PR.

Test Plan: CI
1 parent f8789f7 commit 2019b80
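For context, here is a minimal sketch of the flow this commit fixes. The model and sizes are stand-ins (the actual test uses its own ToyLinearModel); quantize_ and int4_weight_only are the torchao APIs exercised by the test below.

    import torch
    from torchao.quantization import quantize_, int4_weight_only

    # Stand-in model; initialized directly on CUDA, since the CPU-init ->
    # CUDA-move path is what the summary defers to a separate PR.
    m = torch.nn.Sequential(torch.nn.Linear(1024, 1024, bias=False))
    m = m.eval().to(torch.bfloat16).to("cuda")

    # In-place int4 weight-only quantization (TensorCoreTiledAQTLayout underneath).
    quantize_(m, int4_weight_only())

    # Before this fix, .to() on the quantized model could raise even for a
    # CUDA target; a cuda -> cuda move now works.
    m.to(device="cuda")
    out = m(torch.randn(8, 1024, dtype=torch.bfloat16, device="cuda"))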

2 files changed (+19, -3 lines)

test/quantization/test_quant_api.py

Lines changed: 17 additions & 1 deletion

@@ -624,7 +624,7 @@ def test_quantized_tensor_subclass_save_load(self):
 
     @unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "Test only enabled for 2.4+")
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    def test_quantized_model_to_device(self):
+    def test_int8wo_quantized_model_to_device(self):
         m = ToyLinearModel().eval().to(torch.bfloat16)
         m_copy = copy.deepcopy(m)
         example_inputs = m.example_inputs(dtype=torch.bfloat16, device="cpu")
@@ -637,6 +637,22 @@ def test_quantized_model_to_device(self):
         cuda_res = m(*example_inputs_cuda)
         self.assertEqual(cuda_res.cpu(), ref)
 
+    @unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "Test only enabled for 2.4+")
+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    def test_int4wo_quantized_model_to_device(self):
+        # TODO: change initial model to "cpu"
+        m = ToyLinearModel().eval().to(torch.bfloat16).to("cuda")
+        m_copy = copy.deepcopy(m)
+        example_inputs = m.example_inputs(dtype=torch.bfloat16, device="cuda")
+
+        quantize_(m, int4_weight_only())
+        ref = m(*example_inputs)
+
+        example_inputs_cuda = (example_inputs[0].to("cuda"),)
+        m.to(device="cuda")
+        cuda_res = m(*example_inputs_cuda)
+        self.assertEqual(cuda_res.cpu(), ref)
+
     @unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "Test only enabled for 2.4+")
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     def test_quantized_tensor_subclass_save_load_map_location(self):
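Both device-move tests share a name suffix, so they can be selected together with pytest's keyword filter (assuming a local pytest setup; the stated test plan is CI):

    pytest test/quantization/test_quant_api.py -k quantized_model_to_device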

torchao/dtypes/affine_quantized_tensor.py

Lines changed: 2 additions & 2 deletions

@@ -511,8 +511,8 @@ def from_plain(
     def to(self, *args, **kwargs):
         kwargs = self._get_to_kwargs(*args, **kwargs)
         device = kwargs["device"]
-        if device != "cuda" or (isinstance(device, torch.device) and device.type != "cuda"):
-            raise ValueError(f"TensorCoreTiledAQTLayout is only available for cuda device")
+        if device != "cuda" and (isinstance(device, torch.device) and device.type != "cuda"):
+            raise ValueError(f"TensorCoreTiledAQTLayout is only available for cuda device, can't convert to {device}")
         return self.__class__(
             self.packed_weight.to(device),
             self.scale_and_zero.to(device),

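Why the one-word `or` -> `and` change matters: _get_to_kwargs typically hands back a torch.device rather than a string, and a device such as torch.device("cuda", 0) does not compare equal to the bare string "cuda", so the old or-condition raised even for valid CUDA targets. A standalone sketch of the two predicates (old_check/new_check are illustrative names, not from the codebase):

    import torch

    def old_check(device):
        # Pre-fix: with `or`, the string comparison alone triggers the raise.
        return device != "cuda" or (isinstance(device, torch.device) and device.type != "cuda")

    def new_check(device):
        # Post-fix: a torch.device must also have a non-cuda type to be rejected.
        return device != "cuda" and (isinstance(device, torch.device) and device.type != "cuda")

    dev = torch.device("cuda", 0)
    print(old_check(dev))                  # True  -> old code raised on a CUDA device
    print(new_check(dev))                  # False -> new code allows the move
    print(new_check(torch.device("cpu")))  # True  -> CPU devices are still rejected

Note the new predicate only hard-rejects torch.device objects; a bare "cpu" string would pass it, which presumably does not arise because _get_to_kwargs normalizes the device first.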