
Commit 2fe0200

alanwaketan authored and yeounoh committed
Enable 2D sharding (#17)
Summary: This pull request adds 2D SPMD sharding to the table. It shards both weights and activations. Given a 2D mesh (data, model) with data x model == num_devices, the sharding strategy is:
1. input (data, None, model)
2. embedding (model, data)
3. attn QKV (data, model)
4. attn O (model, data)
5. mlp gate, up (model, data)
6. mlp down (data, model)
7. activation (data, None, model)

Currently you can specify the model dimension with the new --spmd_2d_sharding option; the data dimension is then derived automatically as num_devices // model. TODO: consider adding another option to control whether the activations/inputs are sharded at all, or sharded differently.
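For reference, a minimal sketch of the weight half of this strategy, using the same torch_xla SPMD calls the diff relies on (xs.HybridMesh, xs.mark_sharding). The tensor shapes and the model dimension of 4 below are illustrative, not taken from this commit, and the snippet assumes a TPU process already running in SPMD mode:

import torch
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.experimental.xla_sharding as xs
import torch_xla.runtime as xr

num_devices = xr.global_runtime_device_count()
model = 4                          # value that would be passed via --spmd_2d_sharding
data = num_devices // model        # the data dimension is auto-calculated
assert model * data == num_devices

# Two views of the same devices: (data, model) and (model, data).
data_model_mesh = xs.HybridMesh(ici_mesh_shape=(data, model))
model_data_mesh = xs.HybridMesh(ici_mesh_shape=(model, data))

# Illustrative LLaMA-7B-like shapes: attn QKV weights get (data, model),
# while the embedding table gets (model, data).
q_proj_weight = torch.zeros(4096, 4096, device=xm.xla_device())
xs.mark_sharding(q_proj_weight, data_model_mesh, (0, 1))
embed_tokens_weight = torch.zeros(32000, 4096, device=xm.xla_device())
xs.mark_sharding(embed_tokens_weight, model_data_mesh, (0, 1))
print(torch_xla._XLAC._get_xla_sharding_spec(embed_tokens_weight))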
1 parent 1b9c7d8 commit 2fe0200

File tree

3 files changed: +69 -3 lines changed


examples/pytorch/language-modeling/run_clm.py

Lines changed: 46 additions & 0 deletions
@@ -189,6 +189,14 @@ class ModelArguments:
             )
         },
     )
+    spmd_2d_sharding: int = field(
+        default=0,
+        metadata={
+            "help": (
+                "Will apply XLA SPMD to 2D sharding, i.e., weights + activations, and spmd_2d_sharding specifies the model dimension"
+            )
+        },
+    )
 
     def __post_init__(self):
         if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
@@ -468,6 +476,8 @@ def main():
             "You can do it from another script, save it, and load it from here, using --tokenizer_name."
         )
 
+    # Pass the 2d sharding config to the actual model.
+    config.spmd_2d_sharding = model_args.spmd_2d_sharding
     if model_args.model_name_or_path:
         torch_dtype = (
             model_args.torch_dtype
@@ -538,6 +548,42 @@ def main():
             else:
                 assert len(param.shape) == 2
                 xs.mark_sharding(param, mesh, range(len(param.shape)))
+    elif model_args.spmd_2d_sharding > 0:
+        print('Applying 2D sharding to all parameters')
+        for name, param in model.named_parameters():
+            # Apply 2D sharding:
+            # embedding (model, data)
+            # attn QKV (data, model)
+            # attn O (model, data)
+            # mlp gate, up (model, data)
+            # mlp down (data, model)
+            print('> Sharding tensor', name, param.shape)
+            mod = model_args.spmd_2d_sharding
+            data = num_devices // mod
+            assert mod * data == num_devices
+            data_model_mesh = xs.HybridMesh(ici_mesh_shape=(data, mod))
+            model_data_mesh = xs.HybridMesh(ici_mesh_shape=(mod, data))
+
+            # We don't care about layernorm's weights, and
+            # LLaMA doesn't use biases.
+            if len(param.shape) == 1:
+                continue
+
+            if 'embed_tokens' in name:
+                xs.mark_sharding(param, model_data_mesh, range(len(param.shape)))
+            elif 'q_proj' in name or 'k_proj' in name or 'v_proj' in name:
+                xs.mark_sharding(param, data_model_mesh, range(len(param.shape)))
+            elif 'o_proj' in name:
+                xs.mark_sharding(param, model_data_mesh, range(len(param.shape)))
+            elif 'gate_proj' in name or 'up_proj' in name:
+                xs.mark_sharding(param, model_data_mesh, range(len(param.shape)))
+            elif 'down_proj' in name:
+                xs.mark_sharding(param, data_model_mesh, range(len(param.shape)))
+            elif 'lm_head' in name:  # lm_head (output projection) has the same shape as embed_tokens
+                xs.mark_sharding(param, model_data_mesh, range(len(param.shape)))
+
+            import torch_xla
+            print(torch_xla._XLAC._get_xla_sharding_spec(param))
 
     # Preprocessing the datasets.
     # First we tokenize all the texts.

src/transformers/models/llama/modeling_llama.py

Lines changed: 19 additions & 0 deletions
@@ -401,6 +401,22 @@ def forward(
         if not output_attentions:
             attn_weights = None
 
+        # Apply 2D sharding:
+        # activation (data, None, model)
+        import torch_xla.core.xla_model as xm
+        import torch_xla.experimental.xla_sharding as xs
+        import torch_xla.runtime as xr
+        import torch_xla
+        num_devices = xr.global_runtime_device_count()
+        device_ids = torch.arange(num_devices)
+        print('> Sharding activations', attn_output.shape)
+        model = self.spmd_2d_sharding
+        data = num_devices // model
+        assert model * data == num_devices
+        data_model_mesh = xs.HybridMesh(ici_mesh_shape=(data, 1, model))
+        xs.mark_sharding(attn_output, data_model_mesh, (0, 1, 2))
+        print(torch_xla._XLAC._get_xla_sharding_spec(attn_output))
+
         return attn_output, attn_weights, past_key_value
 
 
@@ -920,6 +936,9 @@ class LlamaModel(LlamaPreTrainedModel):
 
     def __init__(self, config: LlamaConfig):
         super().__init__(config)
+        # For PyTorch/XLA's SPMD 2D sharding
+        self.spmd_2d_sharding = config.spmd_2d_sharding
+
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
 

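For reference, a minimal sketch of what the activation annotation above amounts to, using the same calls as the diff. The batch, sequence, and hidden sizes and the model dimension of 4 are illustrative, not taken from this commit, and the snippet assumes a TPU process already running in SPMD mode:

import torch
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.experimental.xla_sharding as xs
import torch_xla.runtime as xr

num_devices = xr.global_runtime_device_count()
model = 4                      # plays the role of self.spmd_2d_sharding in the diff
data = num_devices // model

# Three mesh axes so the 3D activation can be annotated per dimension:
# batch -> data axis, seq_len -> replicated (size-1 axis), hidden -> model axis.
mesh = xs.HybridMesh(ici_mesh_shape=(data, 1, model))

# Stand-in for attn_output with a (batch, seq_len, hidden) shape.
attn_output = torch.zeros(8, 2048, 4096, device=xm.xla_device())
xs.mark_sharding(attn_output, mesh, (0, 1, 2))
print(torch_xla._XLAC._get_xla_sharding_spec(attn_output))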
src/transformers/trainer.py

Lines changed: 4 additions & 3 deletions
@@ -1548,10 +1548,11 @@ def _xla_sharded_dataloader(self, dataloader):
         if self.args.spmd_batch_sharding:
             mesh = xs.Mesh(device_ids, (num_devices, 1))
             sharding_spec = xs.ShardingSpec(mesh, (0, 1))
-        elif self.args.spmd_tensor_sharding > 0:
-            tensor = self.args.spmd_tensor_sharding
+        elif self.args.spmd_tensor_sharding > 0 or self.args.spmd_2d_sharding > 0:
+            assert self.args.spmd_tensor_sharding == 0 or self.args.spmd_2d_sharding == 0
+            tensor = self.args.spmd_tensor_sharding + self.args.spmd_2d_sharding
             fsdp = num_devices // tensor
-            mesh = xs.Mesh(device_ids, (fsdp, tensor))
+            mesh = xs.HybridMesh(ici_mesh_shape=(fsdp, tensor))
             partition_spec = (0, None)
             sharding_spec = xs.ShardingSpec(mesh, partition_spec)
 

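For reference, a minimal sketch of the input sharding this trainer change produces: the (fsdp, tensor) hybrid mesh with partition_spec (0, None) shards the batch dimension of each (batch, seq_len) input along the data/fsdp axis and replicates the sequence dimension. Shapes and the 4-way model dimension below are illustrative, not taken from this commit:

import torch
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.experimental.xla_sharding as xs
import torch_xla.runtime as xr

num_devices = xr.global_runtime_device_count()
tensor = 4                        # --spmd_2d_sharding (or --spmd_tensor_sharding)
fsdp = num_devices // tensor
mesh = xs.HybridMesh(ici_mesh_shape=(fsdp, tensor))

# A (batch, seq_len) token batch, sharded the way the ShardingSpec above shards
# dataloader outputs: batch over the fsdp/data axis, sequence replicated.
input_ids = torch.zeros(8, 2048, dtype=torch.long, device=xm.xla_device())
xs.mark_sharding(input_ids, mesh, (0, None))
print(torch_xla._XLAC._get_xla_sharding_spec(input_ids))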