@@ -143,8 +143,6 @@ def create_quantized_param(
         parent_module = find_parent(model, layer_name)
         node = layer_name.split(".")[-1]
 
-        # print("create_quantized_param | ", 'layer_name', layer_name, type(module), hasattr(module, "quant_config")) #model.layers.0.mlp.down_proj
-
         # set module state_dict
         module_state_dict = {}
         for k, v in state_dict.items():
@@ -154,39 +152,27 @@ def create_quantized_param(
                     unexpected_keys.remove(k)
 
         if self.pre_quantized:
-            if isinstance(module, HQQLinear):
-                return
-            else:
+            if isinstance(module, (torch.nn.Linear, HQQLinear)):
                 hqq_layer = HQQLinear(
                     linear_layer=None,
-                    quant_config=None,  # module.quant_config
+                    quant_config=None,
                     compute_dtype=self.torch_dtype,
                     device=target_device,
                 )
 
-                try:
-                    hqq_layer.load_state_dict(module_state_dict)
-                except Exception:
-                    # TODO @mobicham: Llama3 break with model.layers.28.mlp.down_proj because its parameters are split across 2 safetensors. How to fix this?
-                    # Currently setting a fake layer so that loading doesn't break
-                    print("Error loading, setting a fake layer for", layer_name, module_state_dict.keys())
-                    hqq_layer = HQQLinear(
-                        torch.nn.Linear(in_features=module.in_features, out_features=module.out_features, bias=False),
-                        module.quant_config,
-                        compute_dtype=self.torch_dtype,
-                        device=target_device,
-                        del_orig=True,
-                    )
-
-                if hqq_layer.bias is not None and isinstance(hqq_layer.bias, torch.Tensor):
-                    hqq_layer.bias = torch.nn.Parameter(hqq_layer.bias)
-
-                if self.using_multi_gpu:
-                    hqq_layer = self._patch_layer_for_multigpu(hqq_layer)
-
-                setattr(parent_module, node, hqq_layer)
-                torch.cuda.empty_cache()
-                return
+                hqq_layer.axis = None
+                hqq_layer.channel_wise = None
+                hqq_layer.load_state_dict(module_state_dict)
+
+                if hqq_layer.bias is not None and isinstance(hqq_layer.bias, torch.Tensor):
+                    hqq_layer.bias = torch.nn.Parameter(hqq_layer.bias)
+
+                if self.using_multi_gpu:
+                    hqq_layer = self._patch_layer_for_multigpu(hqq_layer)
+
+                setattr(parent_module, node, hqq_layer)
+                torch.cuda.empty_cache()
+                return
 
         # Step 1: populate module with weight/bias from module state dict
         for key in module_state_dict:
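
For context, the sketch below illustrates the loading path this patch takes for a pre-quantized checkpoint: the layer's serialized tensors are gathered from the flat state dict, an empty HQQLinear is rebuilt from them, and the result is swapped into the parent module. It is an illustration only; `model`, `state_dict`, and the example `layer_name` are assumed inputs, and the HQQLinear calls mirror the diff above rather than documenting the hqq API exhaustively.

```python
# Hypothetical, standalone sketch of restoring one pre-quantized HQQ layer
# from a flat checkpoint state dict and attaching it to the model.
import torch
from hqq.core.quantize import HQQLinear

layer_name = "model.layers.0.mlp.down_proj"  # example module path (assumption)
target_device = "cuda:0"
compute_dtype = torch.float16

# Gather this layer's entries, keyed by their last path component
# (e.g. "W_q", "meta", "bias"), mirroring the filtering done in the patch.
module_state_dict = {
    k.split(".")[-1]: v for k, v in state_dict.items() if layer_name + "." in k
}

# Build an empty HQQLinear and populate it from the serialized tensors,
# resetting the quantization metadata first as the new branch does.
hqq_layer = HQQLinear(
    linear_layer=None,
    quant_config=None,
    compute_dtype=compute_dtype,
    device=target_device,
)
hqq_layer.axis = None
hqq_layer.channel_wise = None
hqq_layer.load_state_dict(module_state_dict)

# Wrap a plain bias tensor in a Parameter so it is tracked by the module.
if hqq_layer.bias is not None and isinstance(hqq_layer.bias, torch.Tensor):
    hqq_layer.bias = torch.nn.Parameter(hqq_layer.bias)

# Replace the original submodule on its parent (the find_parent/setattr step above).
parent_name, _, node = layer_name.rpartition(".")
parent_module = model.get_submodule(parent_name)
setattr(parent_module, node, hqq_layer)
torch.cuda.empty_cache()
```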