@@ -82,10 +82,9 @@ def __init__(
         bias_: Optional[Parameter] = None,
         weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
         bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
-        *args,
         **kwargs,
     ):
-        super().__init__(*args, **kwargs)
+        super().__init__(weight=weight, bias_=bias_, **kwargs)
 
         # Keep input parameters
         self.in_features = in_features
@@ -141,7 +140,7 @@ def __init__(
 
     @staticmethod
     def from_native_module(
-        module: nn.Linear, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, **kwargs
+        module: nn.Linear, process_group: Union[ProcessGroup, List[ProcessGroup]], **kwargs
     ) -> ParallelModule:
         r"""
         Convert a native PyTorch linear layer to a parallelized linear layer.
@@ -174,7 +173,6 @@ def from_native_module(
             process_group=process_group,
             weight=module.weight,
             bias_=module.bias,
-            *args,
             **kwargs,
         )
 
@@ -316,7 +314,7 @@ def __init__(
 
     @staticmethod
     def from_native_module(
-        module: nn.Linear, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, **kwargs
+        module: nn.Linear, process_group: Union[ProcessGroup, List[ProcessGroup]], **kwargs
     ) -> ParallelModule:
         r"""
         Convert a native PyTorch linear layer to a parallelized linear layer.
@@ -350,7 +348,6 @@ def from_native_module(
             process_group=process_group,
             weight=module.weight,
             bias_=module.bias,
-            *args,
             **kwargs,
         )
 
@@ -477,7 +474,7 @@ def reset_parameters(self, weight_initializer, bias_initializer) -> None:
 
     @staticmethod
     def from_native_module(
-        module: nn.Linear, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, **kwargs
+        module: nn.Linear, process_group: Union[ProcessGroup, List[ProcessGroup]], **kwargs
     ) -> PaddingParallelModule:
         r"""
         Convert a native PyTorch linear layer to a parallelized linear layer.
@@ -489,7 +486,6 @@ def from_native_module(
         bias = module.bias is not None
         device = module.weight.device
         # ensure only one process group is passed
-        make_vocab_size_divisible_by = kwargs.pop("make_vocab_size_divisible_by", 64)
 
         lm_head_linear = PaddingLMHead(
             in_features=in_features,
@@ -498,8 +494,6 @@ def from_native_module(
             device=device,
             weight=module.weight,
             bias_=module.bias,
-            make_vocab_size_divisible_by=make_vocab_size_divisible_by,
-            *args,
             **kwargs,
         )
 
@@ -551,7 +545,6 @@ def __init__(
         weight: Optional[Parameter] = None,
         bias_: Optional[Parameter] = None,
         make_vocab_size_divisible_by: int = 64,
-        *args,
         **kwargs,
     ):
         # create weight and bias
@@ -579,12 +572,9 @@ def __init__(
             process_group=process_group,
             weight=weight,
             bias_=bias_,
-            *args,
             **kwargs,
             new_num_embeddings=new_out_features,
             old_num_embeddings=out_features,
-            weight_A=weight,
-            bias_A=bias_,
         )
 
         # get the length of valid embeddings
@@ -599,7 +589,7 @@ def __init__(
 
     @staticmethod
     def from_native_module(
-        module: nn.Linear, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, **kwargs
+        module: nn.Linear, process_group: Union[ProcessGroup, List[ProcessGroup]], **kwargs
     ) -> PaddingParallelModule:
         r"""
         Convert a native PyTorch linear layer to a parallelized linear layer.
@@ -611,8 +601,6 @@ def from_native_module(
         bias = module.bias is not None
         device = module.weight.device
 
-        make_vocab_size_divisible_by = kwargs.pop("make_vocab_size_divisible_by", 64)
-
         lm_head_linear = VocabParallelLMHead1D(
             in_features=in_features,
             out_features=out_features,
@@ -621,41 +609,18 @@ def from_native_module(
             process_group=process_group,
             weight=module.weight,
             bias_=module.bias,
-            make_vocab_size_divisible_by=make_vocab_size_divisible_by,
-            *args,
             **kwargs,
         )
 
         return lm_head_linear
 
     def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
-        assert (
-            input_.shape[-1] == self.weight.shape[-1]
-        ), "Invalid shapes in Linear1D_Col forward: input={}, weight={}. Expected last dim of input {}.".format(
-            input_.shape, self.weight.shape, self.weight.shape[-1]
-        )
-
-        # Set up backprop all-reduce.
-        input_parallel = input_
-
-        # Matrix multiply.
-        bias = self.bias if not self.skip_bias_add else None
-        if self.seq_parallel:
-            output_parallel = linear_gather_forward_reducescatter_backward(
-                input_parallel, self.weight, bias, self.process_group, True, self.seq_parallel_dim, self.overlap
-            )
+        if self.skip_bias_add:
+            output, _ = super().forward(input_)
         else:
-            output_parallel = linear_with_async_comm(input_parallel, self.weight, bias, self.process_group, True)
-
+            output = super().forward(input_)
         if self.gather_output:
-            # All-gather across the partitions.
-            output = gather_forward_split_backward(output_parallel, dim=-1, process_group=self.process_group)
             output = output[..., : self.old_num_embeddings]
         else:
-            output = output_parallel
             output = output[..., : self.num_valid_embeddings_local]
-
-        if self.skip_bias_add:
-            return output, self.bias
-        else:
-            return output
+        return output