Skip to content

Commit dfd083d

Browse files
authored
Support export compressed model for AutoRound [2.x] (#1648)
Signed-off-by: Kaihui-intel <[email protected]>
1 parent 0a3d4bd commit dfd083d

File tree

2 files changed

+24
-2
lines changed

2 files changed

+24
-2
lines changed

neural_compressor/model/torch_model.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -496,6 +496,9 @@ def export_compressed_model(
496496
gptq_config = json.load(f)
497497
else:
498498
gptq_config = self.gptq_config if hasattr(self, "gptq_config") else {}
499+
500+
autoround_config = self.autoround_config if hasattr(self, "autoround_config") else {}
501+
499502
if gptq_config:
500503
for k, v in weight_config.items():
501504
logger.debug(f"Compressing {k} on device {device}")
@@ -555,6 +558,19 @@ def export_compressed_model(
555558
)
556559
new_module.pack(int_weight, gptq_scale, gptq_zp, m.bias, gptq_perm)
557560
set_module(self.model, k, new_module)
561+
elif autoround_config:
562+
from auto_round.export.export_to_itrex import compress_model # pylint: disable=E0401
563+
564+
self.model = compress_model(
565+
self.model,
566+
weight_config=autoround_config,
567+
enable_full_range=enable_full_range,
568+
compression_dtype=compression_dtype,
569+
compression_dim=compression_dim,
570+
device=device,
571+
use_optimum_format=use_optimum_format,
572+
inplace=True,
573+
)
558574
else:
559575
for k, v in weight_config.items():
560576
logger.debug(f"Compressing {k} on device {device}")

test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -801,6 +801,14 @@ def test_AutoRound_quant(self):
801801
self.assertTrue("scale" in q_model.autoround_config["transformer.h.0.attn.k_proj"].keys())
802802
self.assertTrue(torch.float32 == q_model.autoround_config["transformer.h.0.attn.k_proj"]["scale_dtype"])
803803

804+
export_model = q_model.export_compressed_model()
805+
export_out = export_model(input)
806+
self.assertTrue(torch.allclose(out2[0], export_out[0]))
807+
from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear
808+
809+
self.assertTrue(isinstance(q_model.model.transformer.h[0].attn.k_proj, WeightOnlyLinear))
810+
self.assertTrue(isinstance(export_model.transformer.h[0].attn.k_proj, WeightOnlyLinear))
811+
804812
fp32_model = copy.deepcopy(self.gptj)
805813

806814
conf = PostTrainingQuantConfig(
@@ -852,8 +860,6 @@ def test_AutoRound_quant(self):
852860
)
853861
out2 = export_model.model(input)
854862
self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-01))
855-
from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear
856-
857863
self.assertTrue(isinstance(export_model.model.transformer.h[0].attn.k_proj, WeightOnlyLinear))
858864

859865

0 commit comments

Comments (0)