
Commit dba3e04

littskKKZ20 authored and committed
[hotfix] Add layer norm gradients all-reduce for sequence parallel (hpcaitech#4926)
* [hotfix] Add layer norm gradients all-reduce for sequence parallel. (hpcaitech#4915)
* Add layer norm gradients all-reduce for sequence parallel.
* skip pipeline inference test
* [hotfix] fixing polices of sequence parallel (hpcaitech#4922)
* Add layer norm gradients all-reduce for sequence parallel.
* fix parameter passing when calling get_autopolicy

---------
Co-authored-by: littsk <[email protected]>

* Hotfix/add grad all reduce for sequence parallel (hpcaitech#4927)
* Add layer norm gradients all-reduce for sequence parallel.
* fix parameter passing when calling get_autopolicy
* fix bug using wrong variables

---------
Co-authored-by: littsk <[email protected]>

* fix policy initialization
* fix bloom and chatglm policices
* polish code of handling layernorm
* fix moe module
* polish code of class initializing

---------
Co-authored-by: Zhongkai Zhao <[email protected]>
1 parent 8c58648 commit dba3e04

File tree: 30 files changed, +1112 −547 lines


colossalai/booster/plugin/hybrid_parallel_plugin.py (341 additions, 31 deletions)

Large diff not rendered by default.
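Because this file's diff is not rendered, the following is only a hedged illustration of the mechanism it wires up, based on the SeqParallelUtils helper added in colossalai/shardformer/layer/utils.py below. SPGradSyncWrapper and sync_sp_grads are hypothetical names for this sketch, not the plugin's actual classes or methods.

# Illustrative only: how a model wrapper could flush partially derived layer-norm
# gradients after backward, using the helper added in this commit.
import torch.nn as nn
from torch.distributed import ProcessGroup

from colossalai.shardformer.layer.utils import SeqParallelUtils


class SPGradSyncWrapper(nn.Module):
    def __init__(self, module: nn.Module, tp_group: ProcessGroup):
        super().__init__()
        self.module = module
        self.tp_group = tp_group

    def forward(self, *args, **kwargs):
        return self.module(*args, **kwargs)

    def sync_sp_grads(self):
        # All-reduce gradients of parameters marked as partially derived
        # (layer-norm weights/biases under sequence parallelism) across the TP group.
        SeqParallelUtils.allreduce_partial_data_grad(tp_group=self.tp_group, model=self.module)

The intended call order is loss.backward(), then the gradient sync, then optimizer.step(), so the optimizer consumes the fully reduced layer-norm gradients.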

colossalai/booster/plugin/moe_hybrid_parallel_plugin.py (8 additions, 1 deletion)

@@ -338,7 +338,14 @@ def configure(
         if not isinstance(model, ModelWrapper):
             use_ddp = self.dp_size > 1 and self.pp_size == 1 and self.zero_stage == 0
             model = HybridParallelModule(
-                model, self.precision, self.shard_config, self.dp_group, use_ddp, self.ddp_config, self.custom_policy
+                module=model,
+                precision=self.precision,
+                shard_config=self.shard_config,
+                dp_group=self.dp_group,
+                tp_group=self.tp_group,
+                use_ddp=use_ddp,
+                ddp_config=self.ddp_config,
+                custom_policy=self.custom_policy,
             )
         if optimizer is not None and not isinstance(optimizer, OptimizerWrapper):
             if self.zero_stage == 0:
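For orientation, a hedged sketch of how user code reaches these code paths, assuming the enable_sequence_parallelism flag on HybridParallelPlugin and the launch_from_torch(config={}) entry point used in this release series; argument names may differ across versions, and the script must be started with a distributed launcher such as colossalai run or torchrun.

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import HybridParallelPlugin

# Assumed entry point for this release series.
colossalai.launch_from_torch(config={})

# tp_size > 1 with sequence parallelism enabled is the configuration whose
# layer-norm gradients this commit begins all-reducing over the TP group.
plugin = HybridParallelPlugin(tp_size=2, pp_size=1, enable_sequence_parallelism=True)
booster = Booster(plugin=plugin)

# `model` and `optimizer` are assumed to be a supported transformer
# (e.g. a Hugging Face GPT-2 or LLaMA model) and its optimizer, defined elsewhere:
# model, optimizer, *_ = booster.boost(model, optimizer)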

colossalai/inference/tensor_parallel/engine.py (2 additions, 2 deletions)

@@ -218,10 +218,10 @@ def _shard_model_by(self, shardformer: ShardFormer, model: nn.Module) -> None:
         ), "Discrepancy between the tp size of TPInferEngine and the tp size of shard config"
         model_name = model.__class__.__name__
         assert model_name in self.supported_models, f"Unsupported model cls {model_name} for TP inference."
-
+
         model = model.model if self.shard_config.inference_gptq else model
+        policy = get_autopolicy(model, shard_config=self.shard_config)

-        policy = get_autopolicy(model, inference_only=True)
         self.model, _ = shardformer.optimize(model, policy)

         if self.shard_config.inference_gptq:

colossalai/shardformer/README.md (18 additions, 0 deletions)

@@ -235,6 +235,14 @@ class SubModuleReplacementDescription:


 class Policy(ABC):
+    r"""
+    The base class for all the policies. For each different model, it should have a different policy class,
+    like BertPolicy for Bert Model or OPTPolicy for OPT model.
+
+    Shardformer has provided many built-in sharding policies for the mainstream models. You can use the
+    built-in policies by setting `policy = None`, which is already the default argument for `Shardformer.optimize`.
+    If you want to define your own policy, you can inherit from this class and overwrite the methods you want to modify.
+    """

     def __init__(self)
         self.model = None
@@ -245,6 +253,16 @@ class Policy(ABC):
         """
         self.model = model

+    def set_shard_config(self, shard_config: ShardConfig) -> None:
+        r"""
+        Set shard config as an attribute of the Policy object.
+        Args:
+            shard_config (:class:`ShardConfig`): The shard config to be perform
+        """
+        self.shard_config = shard_config
+
+        self.config_sanity_check()
+
     @abstractmethod
     def preprocess(self) -> nn.Module:
         """

colossalai/shardformer/layer/__init__.py (4 additions, 1 deletion)

@@ -2,7 +2,7 @@
 from .embedding import Embedding1D, VocabParallelEmbedding1D
 from .linear import Linear1D_Col, Linear1D_Row
 from .loss import cross_entropy_1d
-from .normalization import FusedLayerNorm, FusedRMSNorm
+from .normalization import FusedLayerNorm, FusedRMSNorm, LayerNorm, RMSNorm
 from .parallel_module import ParallelModule
 from .qkv_fused_linear import FusedLinear1D_Col, GPT2FusedLinearConv1D_Col, GPT2FusedLinearConv1D_Row

@@ -16,6 +16,9 @@
     "DropoutForParallelInput",
     "DropoutForReplicatedInput",
     "cross_entropy_1d",
+    "BaseLayerNorm",
+    "LayerNorm",
+    "RMSNorm",
     "FusedLayerNorm",
     "FusedRMSNorm",
     "FusedLinear1D_Col",

colossalai/shardformer/layer/normalization.py (143 additions, 8 deletions)

@@ -1,11 +1,15 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
 import warnings
+from abc import ABC, abstractmethod
+
 import torch.nn as nn
 from colossalai.lazy import LazyInitContext
 from ._operation import hook_paramter_in_backward

-__all__ = ["FusedLayerNorm", "FusedRMSNorm"]
+from .utils import SeqParallelUtils
+
+__all__ = ["FusedLayerNorm", "FusedRMSNorm", "LayerNorm", "RMSNorm", "BaseLayerNorm"]

 try:
     from apex.contrib.layer_norm.layer_norm import FastLayerNorm
@@ -77,21 +81,128 @@ def forward(self, input):
         return output


-class FusedLayerNorm:
+class BaseLayerNorm(ABC):
+    @abstractmethod
+    def from_native_module(module: nn.Module, sp_partial_derived: bool = False):
+        """
+        Convert a native PyTorch layer normalization module to a specific layer normalization module,
+        and optionally mark parameters for gradient aggregation.
+
+        Args:
+            module (nn.Module): The native PyTorch layer normalization module to be converted.
+            sp_partial_derived (bool): Whether this module's gradients are partially derived in sequence parallelism.
+
+        Returns:
+            nn.Module: The specific layer normalization module.
+
+        Raises:
+            AssertionError: If the provided module is not an instance of the supported layer normalization type.
+        """
+
+
+class RMSNorm(BaseLayerNorm):
+    r"""
+    This is a wrapper around the RMSNorm. It is meant to be used only with the from_native_module interface.
+    """
+
+    def __init__(self) -> None:
+        raise NotImplementedError(
+            "FusedLayerNorm is not implemented as a physical class. "
+            "It is meant to be used only with the from_native_module interface to convert a native RMSNorm module to colossalai layer norm module."
+        )
+
+    @staticmethod
+    def from_native_module(module: nn.Module, sp_partial_derived: bool = False, *args, **kwargs) -> nn.Module:
+        """
+        Convert a native RMSNorm module to colossalai layer norm module,
+        and optionally mark parameters for gradient aggregation.
+
+        Args:
+            module (nn.Module): The native RMSNorm module to be converted.
+            sp_partial_derived (bool): Whether this module's gradients are partially derived in sequence parallelism.
+
+        Returns:
+            nn.Module: The RMSNorm module.
+        """
+
+        LazyInitContext.materialize(module)
+
+        if sp_partial_derived:
+            # Since gradients are computed using only a subset of the data,
+            # aggregation of these gradients is necessary during backpropagation.
+            # Therefore, we annotate these parameters in advance to indicate the need for gradient aggregation.
+            SeqParallelUtils.marked_as_sp_partial_derived_param(module.weight)
+
+        return module
+
+
+class LayerNorm(BaseLayerNorm):
+    r"""
+    This is a wrapper around the torch.nn.LayerNorm. It is meant to be used only with the from_native_module interface.
+    """
+
+    def __init__(self) -> None:
+        raise NotImplementedError(
+            "LayerNorm is not implemented as a physical class. "
+            "It is meant to be used only with the from_native_module interface to convert a native pytorch layer norm module to colossalai layer norm module."
+        )
+
+    @staticmethod
+    def from_native_module(module: nn.LayerNorm, sp_partial_derived: bool = False, *args, **kwargs) -> nn.Module:
+        r"""
+        Convert a native pytorch layer norm module to colossalai layer norm module,
+        and optionally marking parameters for gradient aggregation.
+
+        Args:
+            module (nn.LayerNorm): The native PyTorch LayerNorm module to be converted.
+            sp_partial_derived (bool): Whether this module's gradients are partially derived in sequence parallelism.
+
+        Returns:
+            nn.Module: The LayerNorm module.
+
+        Raises:
+            AssertionError: If the provided module is not an instance of nn.LayerNorm.
+        """
+        assert isinstance(module, nn.LayerNorm), "Only support conversion from nn.LayerNorm."
+
+        LazyInitContext.materialize(module)
+
+        if sp_partial_derived:
+            # Since gradients are computed using only a subset of the data,
+            # aggregation of these gradients is necessary during backpropagation.
+            # Therefore, we annotate these parameters in advance to indicate the need for gradient aggregation.
+            SeqParallelUtils.marked_as_sp_partial_derived_param(module.weight)
+            SeqParallelUtils.marked_as_sp_partial_derived_param(module.bias)
+
+        return module
+
+
+class FusedLayerNorm(BaseLayerNorm):
     r"""
     This is a wrapper around the apex fused layernorm implementation. It is meant to be used only with the from_native_module interface.
     """

     def __init__(self) -> None:
         raise NotImplementedError(
             "FusedLayerNorm is not implemented as a physical class. "
-            "It is meant to be used only with the from_native_module interface to wrap the fused layernorm implementation provided by apex."
+            "It is meant to be used only with the from_native_module interface convert a native pytorch layer norm module to FusedLayerNorm module provided by apex."
         )

     @staticmethod
-    def from_native_module(module: nn.LayerNorm, *args, **kwargs) -> nn.Module:
+    def from_native_module(module: nn.LayerNorm, sp_partial_derived: bool = False, *args, **kwargs) -> nn.Module:
         r"""
-        Convert a native pytorch layer norm module to colossalai layer norm module
+        Convert a native pytorch layer norm module to FusedLayerNorm module provided by apex,
+        and optionally marking parameters for gradient aggregation.
+
+        Args:
+            module (nn.LayerNorm): The native PyTorch LayerNorm module to be converted.
+            sp_partial_derived (bool): Whether this module's gradients are partially derived in sequence parallelism.
+
+        Returns:
+            nn.Module: Union[FastLayerNorm, FusedLayerNorm].
+
+        Raises:
+            AssertionError: If the provided module is not an instance of nn.LayerNorm.
         """

         LazyInitContext.materialize(module)
@@ -120,21 +231,39 @@ def from_native_module(module: nn.LayerNorm, *args, **kwargs) -> nn.Module:
         layernorm.weight = module.weight
         layernorm.bias = module.bias

+        if sp_partial_derived:
+            # Since gradients are computed using only a subset of the data,
+            # aggregation of these gradients is necessary during backpropagation.
+            # Therefore, we annotate these parameters in advance to indicate the need for gradient aggregation.
+            SeqParallelUtils.marked_as_sp_partial_derived_param(layernorm.weight)
+            SeqParallelUtils.marked_as_sp_partial_derived_param(layernorm.bias)
+
         return layernorm


-class FusedRMSNorm:
+class FusedRMSNorm(BaseLayerNorm):
     """
     This is a wrapper around the apex fused rms norm implementation. It is meant to be used only with the from_native_module interface.
     """
     def __init__(self) -> None:
         raise NotImplementedError(
             "FusedRMSNorm is not implemented as a physical class. "
-            "It is meant to be used only with the from_native_module interface to wrap the fused rms norm implementation provided by apex."
+            "It is meant to be used only with the from_native_module interface to Convert a native RMSNorm module to FusedRMSNorm module provided by apex."
         )

     @staticmethod
-    def from_native_module(module: nn.Module, *args, **kwargs) -> nn.Module:
+    def from_native_module(module: nn.Module, sp_partial_derived: bool = False, *args, **kwargs) -> nn.Module:
+        r"""
+        Convert a native RMSNorm module module to FusedRMSNorm module provided by apex,
+        and optionally marking parameters for gradient aggregation.
+
+        Args:
+            module (nn.LayerNorm): The native PyTorch LayerNorm module to be converted.
+            sp_partial_derived (bool): Whether this module's gradients are partially derived in sequence parallelism.
+
+        Returns:
+            nn.Module: FusedRMSNorm module.
+        """
         LazyInitContext.materialize(module)
         # to check if it is huggingface LlamaRMSNorm
         if module.__class__.__name__ == "LlamaRMSNorm":
@@ -151,4 +280,10 @@ def from_native_module(module: nn.Module, *args, **kwargs) -> nn.Module:

         rmsnorm.weight = module.weight

+        if sp_partial_derived:
+            # Since gradients are computed using only a subset of the data,
+            # aggregation of these gradients is necessary during backpropagation.
+            # Therefore, we annotate these parameters in advance to indicate the need for gradient aggregation.
+            SeqParallelUtils.marked_as_sp_partial_derived_param(rmsnorm.weight)
+
         return rmsnorm
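To make the new wrappers concrete, a small usage sketch grounded in the code above; it only needs colossalai importable, and no distributed setup is required just to tag parameters.

import torch.nn as nn

from colossalai.shardformer.layer import LayerNorm
from colossalai.shardformer.layer.utils import SeqParallelUtils

norm = nn.LayerNorm(1024)

# from_native_module returns the same nn.LayerNorm instance; with
# sp_partial_derived=True its weight and bias are tagged so that their
# gradients are all-reduced across the tensor-parallel group later.
wrapped = LayerNorm.from_native_module(norm, sp_partial_derived=True)

assert wrapped is norm
assert SeqParallelUtils.is_sp_partial_derived_param(wrapped.weight)
assert SeqParallelUtils.is_sp_partial_derived_param(wrapped.bias)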

colossalai/shardformer/layer/utils.py (75 additions, 1 deletion)

@@ -1,8 +1,82 @@
 from contextlib import contextmanager
+from typing import List

 import torch
 import torch.distributed as dist
-from torch.distributed import ProcessGroup
+from torch import nn
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+from torch.distributed import ProcessGroup, get_world_size
+
+
+class SeqParallelUtils:
+    @staticmethod
+    def marked_as_sp_partial_derived_param(param):
+        """
+        Mark a parameter as partially derived in sequence parallelism.
+
+        Args:
+            param: The parameter to mark as partially derived.
+        """
+        setattr(param, "partial_derived", True)
+
+    @staticmethod
+    def is_sp_partial_derived_param(param):
+        """
+        Check if a parameter is marked as partially derived in sequence parallelism.
+
+        Args:
+            param: The parameter to check.
+
+        Returns:
+            bool: True if the parameter is marked as partially derived, False otherwise.
+        """
+        return getattr(param, "partial_derived", False)
+
+    @staticmethod
+    def allreduce_partial_data_grad(tp_group: ProcessGroup, model: nn.Module = None, grads: List[torch.Tensor] = None):
+        """
+        Allreduce partial derived gradients across the specified process group.
+
+        This function performs gradient synchronization for parameters that are marked as partially derived in sequence parallelism.
+
+        Args:
+            tp_group (ProcessGroup): The process group for gradient synchronization.
+            model (nn.Module): The model from which gradients will be synchronized.
+            grads (List[torch.Tensor]): The list of gradients to be synchronized.
+
+        Raises:
+            AssertionError: If both `model` and `grads` are provided or neither is provided.
+        """
+        # Ensure that exactly one of `model` and `grads` is provided for gradient synchronization.
+        assert (model is not None) ^ (grads is not None), "Exactly one of model and grads must be not None."
+
+        # Get the size of the process group, which determines whether synchronization is needed.
+        tp_size = get_world_size(tp_group) if tp_group is not None else 1
+
+        if tp_size == 1:
+            # If the process group size is 1, no synchronization is required.
+            return
+
+        if model is not None:
+            # If `model` is provided, extract partial derived gradients from the model's parameters.
+            grads = []
+            for p in model.parameters():
+                if p.grad is not None and SeqParallelUtils.is_sp_partial_derived_param(p):
+                    grads.append(p.grad.data)
+
+            # Flatten and reduce the gradients using the specified process group.
+            coalesced = _flatten_dense_tensors(grads)
+            dist.all_reduce(coalesced, op=dist.ReduceOp.SUM, group=tp_group)
+
+            # Unflatten the synchronized gradients and update the model's gradients.
+            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
+                buf.copy_(synced)
+        else:
+            # If `grads` are provided explicitly, synchronize those gradients directly.
+            coalesced = _flatten_dense_tensors(grads)
+            dist.all_reduce(coalesced, op=dist.ReduceOp.SUM, group=tp_group)
+            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
+                buf.copy_(synced)


 class Randomizer:
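A short usage sketch of the new helper, assuming the caller already holds the tensor-parallel process group and runs under a distributed launcher such as torchrun; the function names here are illustrative wrappers, only the SeqParallelUtils call is from the diff.

import torch
import torch.distributed as dist

from colossalai.shardformer.layer.utils import SeqParallelUtils


def sync_layernorm_grads(model: torch.nn.Module, tp_group: dist.ProcessGroup):
    # Variant 1: let the helper scan model.parameters() for parameters that were
    # marked via marked_as_sp_partial_derived_param and all-reduce their gradients.
    SeqParallelUtils.allreduce_partial_data_grad(tp_group=tp_group, model=model)


def sync_explicit_grads(grads, tp_group: dist.ProcessGroup):
    # Variant 2: pass an explicit list of gradient tensors instead of a model.
    SeqParallelUtils.allreduce_partial_data_grad(tp_group=tp_group, grads=grads)

Either variant is meant to run after loss.backward() and before optimizer.step().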
