1 change: 1 addition & 0 deletions .azure-pipelines/scripts/codeScan/pylint/pylint.sh
@@ -30,6 +30,7 @@ pip install torch \
fvcore \
pymoo \
onnxruntime_extensions \
+peft \
tf_slim \
transformers \
accelerate \
2 changes: 1 addition & 1 deletion docs/source/smooth_quant.md
@@ -304,7 +304,7 @@ In our experiments, an $\alpha$ range of [0.0, 1.0] with a step_size of 0.1 is f
*fully automated*: users only need to pass a model and dataloader.

```python
-from neural_compressor.adaptor.torch_utils.smooth_quant import TorchSmoothQuant
+from neural_compressor.adaptor.torch_utils.waq import TorchSmoothQuant

sq = TorchSmoothQuant(model, dataloader)
alpha = "auto" ##alpha could be a float number to disable auto-tuning and enable fixed-value alpha smoothquant.
```
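The hunk ends before the snippet shows the smoothing actually being applied. As a rough sketch (not part of this diff), the object built above is driven through its `transform` entry point, the same call that appears in the `pytorch.py` changes below:

```python
# Illustrative continuation of the docs snippet above, not a line from this PR.
# alpha="auto" triggers per-layer alpha tuning; a float such as 0.5 fixes alpha globally.
smoothed_model = sq.transform(alpha=alpha)
```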
46 changes: 9 additions & 37 deletions neural_compressor/adaptor/pytorch.py
@@ -20,7 +20,7 @@
import math
import os
import re
-from collections import OrderedDict, UserDict, namedtuple
+from collections import OrderedDict, UserDict
from functools import partial

import yaml
@@ -1800,7 +1800,7 @@ def smooth_quant(
assert folding, "IPEX version >= 2.1 is required for SmoothQuant folding=False."

if not hasattr(self, "sq") or force_re_smooth:
-from .torch_utils.smooth_quant import TorchSmoothQuant
+from neural_compressor.adaptor.torch_utils.waq import TorchSmoothQuant

self.sq = TorchSmoothQuant(
model._model, dataloader=dataloader, example_inputs=self.example_inputs, q_func=self.q_func
@@ -1813,17 +1813,18 @@
kwargs["percentile"] = percentile
if scales_per_op is not None:
kwargs["scales_per_op"] = scales_per_op
+auto_alpha_args["init_alpha"] = default_alpha
model._model = self.sq.transform(
alpha=alpha,
folding=folding,
calib_iter=calib_iter,
weight_clip=weight_clip,
-default_alpha=default_alpha,
auto_alpha_args=auto_alpha_args,
**kwargs,
)
if self.sq.record_max_info:
model.sq_max_info = self.sq.max_value_info
+model.sq_scale_info = self.sq.sq_scale_info
return model

def _apply_pre_optimization(self, model, tune_cfg, recover=False):
@@ -1840,7 +1841,7 @@ def _apply_pre_optimization(self, model, tune_cfg, recover=False):
q_model = model._model
sq_max_info = model.sq_max_info
if sq_max_info:
-from .torch_utils.smooth_quant import TorchSmoothQuant
+from neural_compressor.adaptor.torch_utils.waq import TorchSmoothQuant

tsq = TorchSmoothQuant(q_model, None)
alpha = tune_cfg["recipe_cfgs"]["smooth_quant_args"]["alpha"]
@@ -1876,8 +1877,9 @@ def qdq_quantize(self, model, tune_cfg):
model: qdq quantized model.
"""
q_model = model._model
+from neural_compressor.adaptor.torch_utils.waq import get_module, set_module

from .torch_utils.model_wrapper import QDQLinear, SQLinearWrapper
-from .torch_utils.smooth_quant import get_module, set_module

smoothquant_scale_info = {}
fallback_op_name_list = []
@@ -3317,37 +3319,7 @@ def qdq_quantize(self, model, q_model, tune_cfg, dataloader, q_func):
inplace = True if self.performance_only else False

# fetch SmoothQuant scale info from pre-optimized model
-sq_max_info = model.sq_max_info
-if sq_max_info:
-smoothquant_scale_info = {}
-from .torch_utils.model_wrapper import SQLinearWrapper
-from .torch_utils.smooth_quant import get_module
-
-for _, info in sq_max_info.items():
-alpha = info["alpha"]
-absorbed_layer = info["absorbed_layer"]
-input_minmax = info["input_minmax"]
-# for peft model,lora_B weights is 0.
-weight_max = info["weight_max"]
-if self.sq.weight_clip:
-weight_max = weight_max.clamp(min=1e-5)
-abs_input_max = torch.max(torch.abs(input_minmax[0]), torch.abs(input_minmax[1]))
-input_power = torch.pow(abs_input_max, alpha)
-weight_power = torch.pow(weight_max, 1 - alpha)
-scale = torch.clip(input_power / weight_power, min=1e-5)
-for op_name in absorbed_layer:
-module = copy.deepcopy(get_module(q_model._model, op_name))
-new_module = SQLinearWrapper(module, 1.0 / scale, input_minmax, alpha)
-weight_scale = new_module._get_weight_scale()
-smoothquant_scale_info[op_name] = {
-"alpha": new_module.alpha,
-"input_scale_for_mul": new_module.input_scale,
-"input_scale_after_mul": new_module.scale,
-"input_zero_point_after_mul": new_module.zero_point,
-"input_dtype": new_module.dtype,
-"weight_scale_after_mul": weight_scale,
-}
-logger.debug(f"Current SmoothQuant alpha of {op_name} is {alpha}")
+smoothquant_scale_info = model.sq_scale_info

# Check save_qconf_summary part is a workaround for IPEX bug.
# Sometimes the prepared model from get_op_capablitiy loss this attribute
@@ -4795,7 +4767,7 @@ def teq_quantize(self, model, tune_cfg, dataloader, calib_func):

supported_layers = ["Linear"]
if folding: # pragma: no cover
-from .torch_utils.smooth_quant import GraphTrace
+from neural_compressor.adaptor.torch_utils.waq import GraphTrace

tg = GraphTrace()
absorb_to_layer, _ = tg.get_absorb_to_layer(model, self.example_inputs, supported_layers)
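The block deleted above recomputed the SmoothQuant migration scales inside `qdq_quantize`; after this PR they are read from `model.sq_scale_info`, which `smooth_quant()` now records (see the `sq_scale_info` assignment earlier in this file). For reference, a minimal sketch of the per-channel formula the deleted code implemented, with illustrative names:

```python
import torch


def smoothquant_scale(abs_input_max: torch.Tensor, weight_max: torch.Tensor, alpha: float) -> torch.Tensor:
    """Migration scale s = max|X|^alpha / max|W|^(1 - alpha), clipped away from zero."""
    weight_max = weight_max.clamp(min=1e-5)  # guards all-zero weights, e.g. freshly initialized LoRA B matrices
    input_power = torch.pow(abs_input_max, alpha)
    weight_power = torch.pow(weight_max, 1 - alpha)
    return torch.clip(input_power / weight_power, min=1e-5)
```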
3 changes: 1 addition & 2 deletions neural_compressor/adaptor/torch_utils/awq.py
@@ -13,7 +13,6 @@
# limitations under the License.

import copy
-from functools import partial

import torch

@@ -25,10 +24,10 @@
get_hidden_states,
get_module_input_output,
)
+from neural_compressor.adaptor.torch_utils.waq import set_module

from ...utils import logger
from .model_wrapper import MulLinear
-from .smooth_quant import model_forward, set_module


def _get_absorb_per_block(model, example_inputs, folding=False, weight_config={}):
(additional file, name not shown)
@@ -24,10 +24,10 @@
from torch.quantization import convert, prepare
from tqdm import tqdm

+from neural_compressor.adaptor.torch_utils.waq import TorchSmoothQuant
from neural_compressor.config import default_workspace

from ..model_wrapper import QDQLayer
-from ..smooth_quant import TorchSmoothQuant
from .utils import (
_get_path,
clean_module_weight,
10 changes: 5 additions & 5 deletions neural_compressor/adaptor/torch_utils/model_wrapper.py
@@ -66,9 +66,9 @@ def forward(self, X):

def qdq_weight(self):
# update weight w/ QDQ
-from .smooth_quant import quant_dequant_w
+from neural_compressor.adaptor.torch_utils.waq.utils import quant_dequant_w_v1

-weith_qdq = quant_dequant_w(self.module)
+weith_qdq = quant_dequant_w_v1(self.module)
self.module.weight = torch.nn.Parameter(weith_qdq)


@@ -139,7 +139,7 @@ def _calculate_qparams(self, input_scale, input_minmax, dtype=torch.quint8):
min_val_neg = torch.min(min_val, torch.zeros_like(min_val))
max_val_pos = torch.max(max_val, torch.zeros_like(max_val))
scale = (max_val_pos - min_val_neg) / float(quant_max - quant_min)
-scale = torch.max(scale, torch.tensor([torch.finfo(torch.float32).eps]))
+scale = torch.max(scale, torch.tensor([torch.finfo(torch.float32).eps], device=scale.device))
zero_point = quant_min - torch.round(min_val_neg / scale).to(torch.int)
zero_point = torch.clamp(zero_point, quant_min, quant_max)
return scale, zero_point
@@ -181,7 +181,7 @@ def forward(self, X):
return X

module_name_list = input_scale_dict.keys()
-from .smooth_quant import get_module, set_module
+from neural_compressor.adaptor.torch_utils.waq import get_module, set_module

for name in module_name_list:
module = get_module(tmp_model, name)
@@ -193,7 +193,7 @@

def _wrapper_qdq_linear(tmp_model, module_name_list=[]):
"""Help function to generate a fake QDQ model for loading weights."""
-from .smooth_quant import get_module, set_module
+from neural_compressor.adaptor.torch_utils.waq import get_module, set_module

for name in module_name_list:
module = get_module(tmp_model, name)
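Besides the import moves, this file carries one behavioral fix: the eps tensor in `_calculate_qparams` is now created on the same device as `scale`, so calibration on GPU no longer mixes CUDA and CPU tensors. A small standalone sketch of the pattern (illustrative values, not library code):

```python
import torch

# On a CUDA run, building eps on the default (CPU) device would make torch.max
# raise a device-mismatch error; creating it on scale.device avoids that.
device = "cuda" if torch.cuda.is_available() else "cpu"
scale = torch.tensor([0.0, 0.02, 0.5], device=device)
eps = torch.tensor([torch.finfo(torch.float32).eps], device=scale.device)
scale = torch.max(scale, eps)  # clamp zero scales up to eps without leaving the device
```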