diff --git a/sagemaker-triton/inferentia2/triton_inferentia2.ipynb b/sagemaker-triton/inferentia2/triton_inferentia2.ipynb index 6eea66fa3c..b45ae5a9f2 100644 --- a/sagemaker-triton/inferentia2/triton_inferentia2.ipynb +++ b/sagemaker-triton/inferentia2/triton_inferentia2.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "60a1fdce", "metadata": {}, @@ -9,6 +10,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "a6ce5cb1", "metadata": {}, @@ -22,6 +24,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "b16f14ea", "metadata": {}, @@ -41,6 +44,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "cf042bea", "metadata": {}, @@ -158,6 +162,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "7bb2cab3-c977-4d2e-b181-611b2773e30b", "metadata": {}, @@ -166,6 +171,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "4f618f8e", "metadata": {}, @@ -187,7 +193,7 @@ "\n", "s3_client = boto3.client(\"s3\")\n", "s3_client.download_file(\n", - " \"sagemaker-sample-files\", \"datasets/image/pets/shiba_inu_dog.jpg\", \"shiba_inu_dog.jpg\"\n", + " \"sagemaker-example-files-prod-us-east-2\", \"datasets/image/pets/shiba_inu_dog.jpg\", \"shiba_inu_dog.jpg\"\n", ")\n", "\n", "\n", @@ -204,6 +210,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "c171f622", "metadata": {}, @@ -244,6 +251,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "00f0f261-e960-4a00-a9ad-8a884f9f27aa", "metadata": {}, @@ -274,6 +282,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "5ea0bd27-0e80-44b6-bb1e-322c34dbb9cb", "metadata": {}, @@ -342,6 +351,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "554b50cb-4e32-4ad2-8d59-0391a2294c98", "metadata": {}, @@ -373,6 +383,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "d3c6ab5c-5991-4959-8b85-439ab44498ab", "metadata": {}, @@ -459,6 +470,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "05b7fd73-2107-4705-922a-80dd7ef16833", "metadata": {}, @@ -492,6 +504,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "15ac5e59-936e-4adb-a91e-67db42735307", "metadata": {}, @@ -522,6 +535,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "4dc97ed0-3155-4658-96f3-7e058c801e7c", "metadata": {}, @@ -592,6 +606,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "aa5cca31-bcd6-4ded-9b1a-085ee8e2094b", "metadata": {}, @@ -627,6 +642,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "de473c37-7e5a-4f72-bac1-06524622e41f", "metadata": {}, @@ -666,6 +682,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "fbbd1c02-fcbf-4f7c-b05b-aba9775449de", "metadata": {}, @@ -686,6 +703,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "ca3088a0-09c8-47dc-b21e-270a6f82df51", "metadata": {}, @@ -695,6 +713,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "79ed11d0-9e14-46d4-955c-dd02c04e7867", "metadata": {}, @@ -703,6 +722,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "f079437f-f90d-4ff7-b90b-efbbc9625861", "metadata": {}, @@ -781,6 +801,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "17500c6b-d59c-44bf-93af-7e1fb7fd6783", "metadata": {}, @@ -801,6 +822,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "6fe19a0a-1c82-422f-ac7b-8e3549e79145", "metadata": {}, @@ -871,6 +893,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "61f8bb6f-8605-4e81-8ed2-87e9fbbc4f52", "metadata": {}, @@ -945,6 
+968,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "8ea72aed-98b2-4a2d-9eb6-8f65c04f671e", "metadata": {}, @@ -999,6 +1023,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "b3561d5a-9ab0-4205-85ee-4aefacc8f849", "metadata": {}, @@ -1008,6 +1033,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "f42bec12", "metadata": {}, @@ -1016,6 +1042,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "ecd78917-ab23-46db-941b-8443c767448c", "metadata": {}, @@ -1024,6 +1051,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "907d3eb5-acf2-4f10-843a-715e82ea51d6", "metadata": {}, @@ -1099,6 +1127,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "076950ad-ab9d-44d2-9826-a463848af213", "metadata": {}, @@ -1159,6 +1188,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "81a98829-7497-4e77-944d-0621719f4a71", "metadata": {}, @@ -1249,6 +1279,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "ddf79acd-3ad0-4e88-b746-1a831cc257c7", "metadata": {}, diff --git a/training/distributed_training/pytorch/model_parallel/gpt2/data_pipeline.py b/training/distributed_training/pytorch/model_parallel/gpt2/data_pipeline.py index cca93d0bd5..4a6c3f0c06 100644 --- a/training/distributed_training/pytorch/model_parallel/gpt2/data_pipeline.py +++ b/training/distributed_training/pytorch/model_parallel/gpt2/data_pipeline.py @@ -1,16 +1,14 @@ import gzip import json -import os -import h5py from typing import List, Tuple -import random +import h5py import numpy as np import smdistributed.modelparallel.torch as smp import torch -class WikiPretrainingDataset(torch.utils.data.Dataset): +class BertPretrainingDataset(torch.utils.data.Dataset): def __init__(self, input_file, max_pred_length): self.input_file = input_file self.max_pred_length = max_pred_length @@ -56,9 +54,15 @@ def __getitem__(self, index): return [input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels] -###### Load Openwebtext pretraining data ###### -class OpenwebtextPretrainingDataset(torch.utils.data.Dataset): - def __init__(self, input_paths: List[str], max_sequence_length=None, zipped=True, use_last_file_only=False): +###### Load GPT pretraining data ###### +class GPTPretrainingDataset(torch.utils.data.Dataset): + def __init__( + self, + input_paths: List[str], + max_sequence_length=None, + zipped=True, + use_last_file_only=False, + ): self.input_paths = input_paths self.max_sequence_length = max_sequence_length self.zipped = zipped @@ -79,11 +83,11 @@ def __read_examples(self, paths: List[str]): self.input_data.extend([ln for _, ln in enumerate(f, 1)]) else: if self.use_last_file_only: - with open (paths[-1], "r") as f: + with open(paths[-1], "r") as f: self.input_data = [ln for ln in f] else: for path in paths: - with open (path, "r") as f: + with open(path, "r") as f: self.input_data.extend([ln for ln in f]) # print(f'__Finished building pretraining dataset with {self.iids.shape[0]} rows__') @@ -102,15 +106,27 @@ def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]: e_idx = s_idx + self.max_sequence_length iids = iids[s_idx:e_idx] attns = attns[s_idx:e_idx] + + # Hack to use 4096 seqlen with our existing synthetic data for benchmarking purposes only + # iids = iids.repeat(1,2).flatten() + # attns = attns.repeat(1,2).flatten() + # assert iids.shape[0] == 4096, iids.shape + return iids, attns class DummyDataset(torch.utils.data.dataset.Dataset): - def __init__(self, length, data_type="openwebtext"): - if data_type 
== "openwebtext": + def __init__(self, length, data_type="GPT"): + if data_type == "GPT": self.batch = (torch.Tensor(0), torch.Tensor(0)) - elif data_type == "wiki": - self.batch = (torch.Tensor(0), torch.Tensor(0), torch.Tensor(0), torch.Tensor(0), torch.Tensor(0)) + elif data_type == "BERT": + self.batch = ( + torch.Tensor(0), + torch.Tensor(0), + torch.Tensor(0), + torch.Tensor(0), + torch.Tensor(0), + ) self.length = length def __getitem__(self, index): @@ -130,26 +146,30 @@ def create_pretraining_dataloader( shuffle: bool = False, zipped: bool = True, use_last_file_only: bool = False, - data_type: str = "openwebtext", + data_type: str = "GPT", ): if smp.pp_rank() == 0: - if data_type == "openwebtext": - data = OpenwebtextPretrainingDataset( - input_paths=input_paths, max_sequence_length=max_sequence_length, zipped=zipped, use_last_file_only=use_last_file_only + if data_type == "GPT": + data = GPTPretrainingDataset( + input_paths=input_paths, + max_sequence_length=max_sequence_length, + zipped=zipped, + use_last_file_only=use_last_file_only, ) - elif data_type == "wiki": + elif data_type == "BERT": if len(input_paths) > 1: - print(f"Wiki data only support single file when calling create_pretraining_dataloader, reading the first file instead..") - data = WikiPretrainingDataset(input_file=input_paths[0], max_pred_length=max_sequence_length) + print( + f"BERT data only support single file when calling create_pretraining_dataloader, reading the first file instead.." + ) + data = BertPretrainingDataset( + input_file=input_paths[0], max_pred_length=max_sequence_length + ) else: raise ValueError(f"Unsupported data type {data_type}") + # TODO: set sampler.epoch to correctly shuffle across epochs, else same order will be used for all epochs + # not relevant now as we have no epochs sampler = torch.utils.data.DistributedSampler( - data, - shuffle=shuffle, - seed=seed, - rank=dp_rank, - num_replicas=dp_size, - drop_last=True, + data, shuffle=shuffle, seed=seed, rank=dp_rank, num_replicas=dp_size, drop_last=True ) dataloader = torch.utils.data.DataLoader( data, @@ -165,4 +185,4 @@ def create_pretraining_dataloader( dataset = DummyDataset(data_len * batch_size, data_type=data_type) dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, drop_last=True) - return dataloader \ No newline at end of file + return dataloader diff --git a/training/distributed_training/pytorch/model_parallel/gpt2/data_prep_512.py b/training/distributed_training/pytorch/model_parallel/gpt2/data_prep_512.py deleted file mode 100644 index f1f22ec0a6..0000000000 --- a/training/distributed_training/pytorch/model_parallel/gpt2/data_prep_512.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -Download and preprocess the openwebtext dataset using HuggingFace's dataset library -""" -import torch -from datasets import load_dataset -from transformers import GPT2TokenizerFast -# download the unprocessed dataset -dataset = load_dataset('openwebtext', split='train') -tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") - -# Process the dataset and split it into train and test subsets -dataset = dataset.map(lambda e: tokenizer(e['text'], max_length=512, truncation=True), num_proc=96) -print(dataset) -dataset = dataset.filter(lambda e: len(e['input_ids']) >= 512, num_proc=96) -print(dataset) - -dataset = dataset.remove_columns('text') -shuffled_dataset = dataset.shuffle(seed=42) -print(shuffled_dataset) -dataset=shuffled_dataset.train_test_split(test_size=0.1) -print(dataset) - -train_dataset=dataset['train'] 
-test_dataset=dataset['test'] - -print(test_dataset) - -# Write the processed dataset into files -# Specify your own path to save the files -test_path = "/home/ubuntu/openwebtext_seq_512_no_pad_filtered/val" -train_path = "/home/ubuntu/openwebtext_seq_512_no_pad_filtered/train" - -num_shards=64 -for i in range(0, num_shards): - shard_test=test_dataset.shard(num_shards=num_shards, index=i) - name=f"{test_path}/test_dataset_512_filtered_{i}" - print(name) - print(shard_test) - shard_test.to_json(f"{name}.json", orient="records", lines=True) - -num_shards=512 -print(train_dataset) - -for i in range(0, num_shards): - name=f"{train_path}/train_dataset_512_filtered_{i}" - print(name) - shard=train_dataset.shard(num_shards=num_shards, index=i) - print(shard) - shard.to_json(f"{name}.json", orient="records", lines=True) \ No newline at end of file diff --git a/training/distributed_training/pytorch/model_parallel/gpt2/learning_rates.py b/training/distributed_training/pytorch/model_parallel/gpt2/learning_rates.py index e6aa7badae..3939730875 100644 --- a/training/distributed_training/pytorch/model_parallel/gpt2/learning_rates.py +++ b/training/distributed_training/pytorch/model_parallel/gpt2/learning_rates.py @@ -1,6 +1,5 @@ # coding=utf-8 # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# Modifications Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,16 +16,26 @@ """Learning rate decay functions.""" import math + import smdistributed.modelparallel.torch as smp + class AnnealingLR(object): """Anneals the learning rate.""" - def __init__(self, optimizer, start_lr, - warmup_iter, plateau_iter, total_iters, - decay_style, last_iter, min_lr=0.0, - use_checkpoint_lr_scheduler=True, - override_lr_scheduler=False): + def __init__( + self, + optimizer, + start_lr, + warmup_iter, + plateau_iter, + total_iters, + decay_style, + last_iter, + min_lr=0.0, + use_checkpoint_lr_scheduler=True, + override_lr_scheduler=False, + ): # Class values. self.optimizer = optimizer @@ -41,17 +50,18 @@ def __init__(self, optimizer, start_lr, self.override_lr_scheduler = override_lr_scheduler self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler if self.override_lr_scheduler: - assert not self.use_checkpoint_lr_scheduler, 'both override and '\ - 'use-checkpoint are set.' + assert not self.use_checkpoint_lr_scheduler, ( + "both override and " "use-checkpoint are set." + ) # Set the learning rate self.step(self.num_iters) if smp.rank() == 0: - print('Learning rate decay style: {}'.format(self.decay_style)) + print("Learning rate decay style: {}".format(self.decay_style)) def get_lr(self): """Learning rate decay functions from: - https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" + https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" num_iters_ = min(self.num_iters, self.end_iter - self.warmup_iter) # Warmup. 
@@ -59,17 +69,20 @@ def get_lr(self): return float(self.start_lr) * num_iters_ / self.warmup_iter num_iters_ = num_iters_ - self.warmup_iter - if self.decay_style == 'linear': + if self.decay_style == "linear": lr = self.start_lr * (self.end_iter - num_iters_) / self.end_iter - elif self.decay_style == 'plateau': + elif self.decay_style == "plateau": if self.num_iters <= self.plateau_iter: lr = self.start_lr else: - lr = self.start_lr * (self.end_iter - self.num_iters) / (self.end_iter - self.plateau_iter) - elif self.decay_style == 'cosine': - lr = self.start_lr / 2.0 * (math.cos( - math.pi * num_iters_ / self.end_iter) + 1) - elif self.decay_style == 'exponential': + lr = ( + self.start_lr + * (self.end_iter - self.num_iters) + / (self.end_iter - self.plateau_iter) + ) + elif self.decay_style == "cosine": + lr = self.start_lr / 2.0 * (math.cos(math.pi * num_iters_ / self.end_iter) + 1) + elif self.decay_style == "exponential": # exp(-0.693) = 1/2 lr = self.start_lr * math.exp(-0.693 * num_iters_ / self.end_iter) else: @@ -83,16 +96,16 @@ def step(self, step_num=None): self.num_iters = step_num new_lr = self.get_lr() for group in self.optimizer.param_groups: - group['lr'] = new_lr + group["lr"] = new_lr def state_dict(self): state_dict = { - 'start_lr': self.start_lr, - 'warmup_iter': self.warmup_iter, - 'num_iters': self.num_iters, - 'decay_style': self.decay_style, - 'end_iter': self.end_iter, - 'min_lr': self.min_lr + "start_lr": self.start_lr, + "warmup_iter": self.warmup_iter, + "num_iters": self.num_iters, + "decay_style": self.decay_style, + "end_iter": self.end_iter, + "min_lr": self.min_lr, } return state_dict @@ -101,31 +114,30 @@ def _check_and_set(self, cls_value, sd_value, name): setting them.""" if self.override_lr_scheduler: if smp.rank() == 0: - print('Overriding {} value to {}'.format(name, cls_value)) + print("Overriding {} value to {}".format(name, cls_value)) return cls_value if not self.use_checkpoint_lr_scheduler: - assert cls_value == sd_value, 'AnnealingLR: class input value' \ - 'and checkpoint values for {} do not match'.format(name) + assert ( + cls_value == sd_value + ), "AnnealingLR: class input value" "and checkpoint values for {} do not match".format( + name + ) if smp.rank() == 0: - print(' > using checkpoint value {} for {}'.format(sd_value, - name)) + print(" > using checkpoint value {} for {}".format(sd_value, name)) return sd_value def load_state_dict(self, sd): - self.start_lr = self._check_and_set(self.start_lr, sd['start_lr'], - 'learning rate') - self.min_lr = self._check_and_set(self.min_lr, sd['min_lr'], - 'minimum learning rate') - self.warmup_iter = self._check_and_set(self.warmup_iter, - sd['warmup_iter'], - 'warmup iterations') - self.end_iter = self._check_and_set(self.end_iter, sd['end_iter'], - 'total number of iterations') - self.decay_style = self._check_and_set(self.decay_style, - sd['decay_style'], - 'decay style') - - self.num_iters = sd['num_iters'] + self.start_lr = self._check_and_set(self.start_lr, sd["start_lr"], "learning rate") + self.min_lr = self._check_and_set(self.min_lr, sd["min_lr"], "minimum learning rate") + self.warmup_iter = self._check_and_set( + self.warmup_iter, sd["warmup_iter"], "warmup iterations" + ) + self.end_iter = self._check_and_set( + self.end_iter, sd["end_iter"], "total number of iterations" + ) + self.decay_style = self._check_and_set(self.decay_style, sd["decay_style"], "decay style") + + self.num_iters = sd["num_iters"] self.step(self.num_iters) diff --git 
a/training/distributed_training/pytorch/model_parallel/gpt2/memory_tracker.py b/training/distributed_training/pytorch/model_parallel/gpt2/memory_tracker.py index 329926a26e..b91c9241d8 100644 --- a/training/distributed_training/pytorch/model_parallel/gpt2/memory_tracker.py +++ b/training/distributed_training/pytorch/model_parallel/gpt2/memory_tracker.py @@ -1,30 +1,32 @@ -import psutil import os +import psutil import smdistributed.modelparallel.torch as smp import torch + try: from py3nvml import py3nvml except ImportError: py3nvml = None dtype_to_bit = { -torch.float32 : 32, -torch.float64 : 64, -torch.float16: 16, -torch.bfloat16: 16, -torch.uint8: 8, -torch.int8: 8, -torch.int16: 16, -torch.int32: 32, -torch.int64: 64, -torch.bool: 1 + torch.float32: 32, + torch.float64: 64, + torch.float16: 16, + torch.bfloat16: 16, + torch.uint8: 8, + torch.int8: 8, + torch.int16: 16, + torch.int32: 32, + torch.int64: 64, + torch.bool: 1, } process = psutil.Process(os.getpid()) base_mem_usage = process.memory_info().data last_mem_usage = base_mem_usage + def memory_status(msg="", reset_max=True, sync=True): rank = smp.rank() @@ -60,11 +62,11 @@ def memory_status(msg="", reset_max=True, sync=True): max_cached /= 1024**3 print( - f'[{msg}] rank {rank} tp_rank {tp_rank} pp_rank {pp_rank} TORCH {torch.__version__}', - f'device={local_rank} ' - f'alloc {alloced:0.4f} max_alloced {max_alloced:0.4f} ' - f'cache {cached:0.4f} max_cached {max_cached:0.4f} ' - f'{total_used_str}' + f"[{msg}] rank {rank} tp_rank {tp_rank} pp_rank {pp_rank} TORCH {torch.__version__}", + f"device={local_rank} " + f"alloc {alloced:0.4f} max_alloced {max_alloced:0.4f} " + f"cache {cached:0.4f} max_cached {max_cached:0.4f} " + f"{total_used_str}", ) if reset_max: torch.cuda.reset_max_memory_cached() @@ -72,8 +74,10 @@ def memory_status(msg="", reset_max=True, sync=True): if py3nvml != None: py3nvml.nvmlShutdown() + def memory_status_cpu(msg=""): import gc + global last_mem_usage global base_mem_usage rdp_rank = smp.rdp_rank() @@ -85,8 +89,8 @@ def memory_status_cpu(msg=""): torch_usage = 0 for t in tensors: torch_usage += t.numel() * dtype_to_bit[t.dtype] - #total_usage = psutil.virtual_memory()[3] # This will get the total usage for all processes - current_usage = process.memory_info().data + # total_usage = psutil.virtual_memory()[3] # This will get the total usage for all processes + current_usage = process.memory_info().data total_usage = current_usage - base_mem_usage usage_change = current_usage - last_mem_usage last_mem_usage = current_usage @@ -105,7 +109,7 @@ def memory_status_cpu(msg=""): return print( - f'[{msg}] rank {rank} tp_rank {tp_rank} pp_rank {pp_rank} TORCH {torch.__version__}', - f'device={local_rank} ' - f'torch cpu tensor usage {torch_usage:0.4f} cpu mem usage {total_usage:0.4f} change since last measurement {usage_change:0.4f} base cpu mem usage {base_usage:0.4f}' - ) \ No newline at end of file + f"[{msg}] rank {rank} tp_rank {tp_rank} pp_rank {pp_rank} TORCH {torch.__version__}", + f"device={local_rank} " + f"torch cpu tensor usage {torch_usage:0.4f} cpu mem usage {total_usage:0.4f} change since last measurement {usage_change:0.4f} base cpu mem usage {base_usage:0.4f}", + ) diff --git a/training/distributed_training/pytorch/model_parallel/gpt2/model_config.py b/training/distributed_training/pytorch/model_parallel/gpt2/model_config.py new file mode 100644 index 0000000000..ae3b40b86c --- /dev/null +++ b/training/distributed_training/pytorch/model_parallel/gpt2/model_config.py @@ -0,0 +1,149 @@ +"""Util 
function to get GPT or BLOOM model configs.""" + +import logging + +from transformers import ( # pylint: disable=import-error + AutoConfig, + BloomConfig, + GPT2Config, + GPTNeoXConfig, + T5Config, +) + + +def _get_gpt2_config_from_args(args): + """Get GPT2 config.""" + + return { + "vocab_size": args.vocab_size, + "n_positions": args.max_context_width, + "n_embd": args.hidden_width, + "n_layer": args.num_layers, + "n_head": args.num_heads, + "n_inner": None, + "activation_function": "gelu_new", + "resid_pdrop": args.resid_pdrop, + "embd_pdrop": args.embd_pdrop, + "attn_pdrop": args.attn_pdrop, + "layer_norm_epsilon": 1e-05, + "initializer_range": args.initializer_range, + "summary_type": "cls_index", + "summary_use_proj": True, + "summary_activation": None, + "summary_proj_to_labels": True, + "summary_first_dropout": args.summary_first_pdrop, + # "gradient_checkpointing": args.gradient_checkpointing > 0, + "use_cache": False, + "bos_token_id": 50256, + "eos_token_id": 50256, + "return_dict": True, + } + + +def _get_gpt_neox_config_from_args(args): + """Get GPTNeoX config.""" + + return { + "vocab_size": args.vocab_size, + "hidden_size": args.hidden_width, + "num_hidden_layers": args.num_layers, + "num_attention_heads": args.num_heads, + "hidden_act": "gelu", + "intermediate_size": 4 * args.hidden_width, + "rotary_pct": args.rotary_pct, + "rotary_emb_base": args.rotary_emb_base, + "max_position_embeddings": args.max_context_width, + "layer_norm_epsilon": 1e-05, + "initializer_range": args.initializer_range, + "use_cache": False, + "parallel_attn_output": True, + } + + +def _get_bloom_config_from_args(args): + """Get BLOOM config.""" + + return { + "vocab_size": args.vocab_size, + "hidden_size": args.hidden_width, + "n_layer": args.num_layers, + "n_head": args.num_heads, + "hidden_dropout": 0.0, + "attention_dropout": 0.0, + "layer_norm_epsilon": 1e-05, + "initializer_range": args.initializer_range, + "summary_type": "cls_index", + "summary_use_proj": True, + "summary_activation": None, + "summary_proj_to_labels": True, + "summary_first_dropout": args.summary_first_pdrop, + # "gradient_checkpointing": args.gradient_checkpointing > 0, + "use_cache": False, + "bos_token_id": 50256, + "eos_token_id": 50256, + "return_dict": True, + } + + +def _get_t5_config_from_args(args): + """Get T5 config.""" + + return { + "vocab_size": args.vocab_size, + "d_model": args.hidden_width, + "d_kv": 64, + "d_ff": args.intermediate_size, + "num_layers": args.num_layers, + "num_decoder_layers": args.num_layers, + "num_heads": args.num_heads, + "relative_attention_num_buckets": 32, + "relative_attention_max_distance": 128, + "dropout_rate": 0.1, + "layer_norm_epsilon": 1e-6, + "initializer_factor": 1.0, + "feed_forward_proj": "gated-gelu", + "is_encoder_decoder": True, + "use_cache": False, + "pad_token_id": 0, + "eos_token_id": 1, + "decoder_start_token_id": 0, + } + + +def get_model_config_from_args(model_type, model_name, args, log=False): + """Get model config for GPT or BLOOM: From cmd args.""" + if model_name: + logging.info(f"Loading config from HF model {model_name}") + return AutoConfig.from_pretrained(model_name), args + + if model_type == "gpt2": + config_type = GPT2Config + config_kwargs = _get_gpt2_config_from_args(args) + elif model_type == "gpt_neox": + config_type = GPTNeoXConfig + config_kwargs = _get_gpt_neox_config_from_args(args) + elif model_type == "bloom": + config_type = BloomConfig + config_kwargs = _get_bloom_config_from_args(args) + if args.use_distributed_transformer > 0: + 
args.use_distributed_transformer = 0 + logging.warning( + "DistributedTransformer does not support Bloom, falling back " + "to regular HF implementation." + ) + elif model_type == "flan_t5": + config_type = T5Config + config_kwargs = _get_t5_config_from_args(args) + if args.use_distributed_transformer > 0: + args.use_distributed_transformer = 0 + logging.warning( + "DistributedTransformer does not support T5, falling back " + "to regular HF implementation." + ) + + if log: + logging.info("Args for model %s:", model_type) + for key, value in sorted(config_kwargs.items()): + logging.info(" config %-20s: %s", key, value) + + return config_type(**config_kwargs), args diff --git a/training/distributed_training/pytorch/model_parallel/gpt2/sdp_utils.py b/training/distributed_training/pytorch/model_parallel/gpt2/sdp_utils.py new file mode 100644 index 0000000000..a0033abab9 --- /dev/null +++ b/training/distributed_training/pytorch/model_parallel/gpt2/sdp_utils.py @@ -0,0 +1,38 @@ +import torch + + +def build_param_id_to_offset(param_groups): + param_id_to_offset = [] + for i, group in enumerate(param_groups): + offset = 0 + group_offsets = {} + for p in group["params"]: + size = p.ds_tensor.ds_numel + group_offsets[id(p)] = (offset, size) + offset += size + param_id_to_offset.append(group_offsets) + return param_id_to_offset + + +def build_param_id_to_buffer(optimizer, param_id_to_offset): + param_id_to_buffer = {} + for i, group in enumerate(optimizer.param_groups): + for _id, (offset, sz) in param_id_to_offset[i].items(): + buf = optimizer.fp32_partitioned_groups_flat[i].narrow(0, offset, sz) + param_id_to_buffer[_id] = buf + return param_id_to_buffer + + +def log_param_norms(model, optimizer, param_id_to_buffer): + weight_norms = {} + other_norms = {} + for name, param in model.named_parameters(): + buf = param_id_to_buffer[id(param)] + param_norm = torch.linalg.norm(buf) ** 2 + other_norm = torch.linalg.norm(param.ds_tensor.data) ** 2 + torch.distributed.all_reduce(param_norm, group=optimizer.ds_param_shard_group) + torch.distributed.all_reduce(other_norm, group=optimizer.ds_param_shard_group) + weight_norms[name] = torch.sqrt(param_norm).item() + other_norms[name] = torch.sqrt(other_norm).item() + if smp.rank() == 0: + print(f"{name}: {weight_norms[name]} {other_norms[name]}") diff --git a/training/distributed_training/pytorch/model_parallel/gpt2/sharded_data_parallel_checkpoint.py b/training/distributed_training/pytorch/model_parallel/gpt2/sharded_data_parallel_checkpoint.py deleted file mode 100644 index e9e7ebd79e..0000000000 --- a/training/distributed_training/pytorch/model_parallel/gpt2/sharded_data_parallel_checkpoint.py +++ /dev/null @@ -1,240 +0,0 @@ -import torch -import glob -import math -import os -import re -import gc -from collections import OrderedDict - -# load to cpu -device = torch.device('cpu') -smp_prefix = "module." 
- -def atoi(text): - return int(text) if text.isdigit() else text - - -def natural_keys(text): - ''' - alist.sort(key=natural_keys) sorts in human order - http://nedbatchelder.com/blog/200712/human_sorting.html - (See Toothy's implementation in the comments) - ''' - return [ atoi(c) for c in re.split(r'(\d+)', text) ] - -def get_model_state_file(checkpoint_dir): - if not os.path.isdir(checkpoint_dir): - raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") - file = os.path.join(checkpoint_dir, "model_0.pt") - - if not os.path.exists(file): - raise FileNotFoundError(f"can't find model states file at '{file}'") - - return file - -def get_optim_files(checkpoint_dir): - optim_files = sorted(glob.glob(os.path.join(checkpoint_dir, "optimizer_*.pt")), key=natural_keys) - - if len(optim_files) == 0: - raise FileNotFoundError( - f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'") - - return optim_files - -def get_user_content_file(checkpoint_dir): - file = os.path.join(checkpoint_dir, "user_content.pt") - if not os.path.exists(file): - raise FileNotFoundError(f"can't find user content file at '{file}'") - return file - -def parse_model_state(model_file, user_content_file, dtype): - state_dict = torch.load(model_file, map_location=device) - user_content = torch.load(user_content_file, map_location=device) - - if "buffer_names" not in user_content: - raise ValueError(f"{user_content_file} miss buffer_names to reconstruct the full state") - if "param_shapes" not in user_content: - raise ValueError(f"{user_content_file} miss param_shapes to reconstruct the full state") - buffer_names = user_content["buffer_names"] - param_shapes = user_content["param_shapes"] - - # recover just the buffers while restoring them to the specified dtype - buffers = { - k: v.to(dtype) - for k, - v in state_dict["module"].items() if k in buffer_names - } - - return buffers, param_shapes - -def parse_optim_states(files, checkpoint_dir, dtype): - total_files = len(files) - state_dicts = [] - sharded_data_parallel_size = None - # param_shapes = None - fp32_groups_key = None - for i, f in enumerate(files): - states = torch.load(f, map_location=device) - if i == 0: - sharded_data_parallel_size = states["partition_count"] - states["fp32_flat_groups"] = [group.to(dtype) for group in states["fp32_flat_groups"]] - state_dicts.append(states["fp32_flat_groups"]) - - if type(sharded_data_parallel_size) is list: - sharded_data_parallel_size = max(sharded_data_parallel_size) - - if sharded_data_parallel_size != total_files: - raise ValueError( - f"Expected {sharded_data_parallel_size} of 'optimizer_*.pt' under '{checkpoint_dir}' but found {total_files} files. " - "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
- ) - - flat_groups = [ - torch.cat(state_dicts[i], - 0) for i in range(len(state_dicts)) - ] - - return sharded_data_parallel_size, flat_groups - -def partitioned_param_info(unpartitioned_numel, sharded_data_parallel_size): - remainder = unpartitioned_numel % sharded_data_parallel_size - padding_numel = (sharded_data_parallel_size - remainder) if remainder else 0 - partitioned_numel = math.ceil(unpartitioned_numel / sharded_data_parallel_size) - return partitioned_numel, padding_numel - -def get_full_state_dict_from_sharded_data_parallel_checkpoint(checkpoint_dir, dtype=torch.float32, tag=None, remove_smp_prefix=True): - """ - Returns full state_dict reconstructed from sharded data parallel checkpoint - - Args: - - checkpoint_dir: path to the sharded data parallel checkpoint folder (where the optimizer files are) - - dtype: the dtype of the output full checkpoint - - tag: the checkpoint tag, if not specified will read the newest checkpoint - - remove_smp_prefix: remove the "module." prefix created by smp - - """ - if tag is None: - latest_path = os.path.join(checkpoint_dir, 'newest') - if os.path.isfile(latest_path): - with open(latest_path, 'r') as fd: - tag = fd.read().strip() - else: - raise ValueError(f"Unable to find 'newest' file at {latest_path}") - - checkpoint_dir = os.path.join(checkpoint_dir, tag) - - if not os.path.isdir(checkpoint_dir): - raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") - - print(f"Processing checkpoint '{checkpoint_dir}'") - - optim_files = get_optim_files(checkpoint_dir) - sharded_data_parallel_size, flat_groups = parse_optim_states(optim_files, checkpoint_dir, dtype) - - model_file = get_model_state_file(checkpoint_dir) - user_content_file = get_user_content_file(checkpoint_dir) - buffers, param_shapes = parse_model_state(model_file, user_content_file, dtype) - - gc.collect() - avail_numel = flat_groups[0].numel() * sharded_data_parallel_size - # merge list of dicts, preserving order - param_shapes = {k: v for d in param_shapes for k, v in d.items()} - - # params - offset = 0 - total_numel = 0 - total_params = 0 - - state_dict = OrderedDict() - state_dict.update(buffers) - - for name, shape in param_shapes.items(): - if remove_smp_prefix and name.startswith(smp_prefix): - name = name[len(smp_prefix):] - - unpartitioned_numel = shape.numel() - total_numel += unpartitioned_numel - total_params += 1 - - partitioned_numel, partitioned_padding_numel = partitioned_param_info(unpartitioned_numel, sharded_data_parallel_size) - - print( - f"{total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" - ) - - # memory usage doubles here - state_dict[name] = torch.cat( - tuple(flat_groups[i].narrow(0, - offset, - partitioned_numel) - for i in range(sharded_data_parallel_size)), - 0).narrow(0, - 0, - unpartitioned_numel).view(shape) - offset += partitioned_numel - - offset *= sharded_data_parallel_size - - # Sanity check - if offset != avail_numel: - raise ValueError( - f"consumed {offset} numels out of {avail_numel} - something is wrong") - - print( - f"Reconstructed state dict with {total_params} params {total_numel} elements" - ) - - return state_dict - -def get_param_shapes(model, optimizer): - """Returns a dict of name to shape mapping, only for the flattened weights saved by the - optimizer. the names are exactly as in state_dict. 
The order is absolutely important, since - the saved data is just flattened data with no identifiers and requires reconstruction in the - same order it was saved. - - We can't rely on module.named_parameters() to get the saved tensors, as some params - will be missing and others unsaved and then it'd be impossible to reconstruct state_dict - from the flattened weights. - """ - param_group_shapes = [] - cnt = 0 - numel = 0 - - bit16_groups = optimizer.fp16_groups - param_names = {param: name for name, param in model.module.named_parameters()} - - for bit16_group in bit16_groups: - param_shapes = OrderedDict() - for param in bit16_group: - cnt += 1 - numel += param.ds_numel if hasattr(param, "ds_numel") else param.numel() - shape = param.ds_shape if hasattr(param, "ds_shape") else param.shape - if param not in param_names: - raise ValueError(f"failed to find optimizer param in named params") - name = param_names[param] - param_shapes[name] = shape - - param_group_shapes.append(param_shapes) - - return param_group_shapes - -def get_buffer_names(model): - buffer_names = [] - - # we save buffer names so that we could extract later the real buffers from the saved - # state_dict["module"] in the non-zero checkpoint - the buffers are already there but they - # are intermixed with param placeholders - - # have to traverse the tree to be able to skip non-persistent buffers - def get_layer_named_buffers(module, prefix=""): - for name, buf in module.named_buffers(recurse=False): - if buf is not None and name not in module._non_persistent_buffers_set: - buffer_names.append(prefix + name) - - for name, child in module.named_children(): - if child is not None: - get_layer_named_buffers(child, prefix + name + ".") - - get_layer_named_buffers(model.module, prefix="") - - return buffer_names \ No newline at end of file diff --git a/training/distributed_training/pytorch/model_parallel/gpt2/smp-fine-tune-gpt-sharded-data-parallel.ipynb b/training/distributed_training/pytorch/model_parallel/gpt2/smp-fine-tune-gpt-sharded-data-parallel.ipynb new file mode 100644 index 0000000000..ab24518dbc --- /dev/null +++ b/training/distributed_training/pytorch/model_parallel/gpt2/smp-fine-tune-gpt-sharded-data-parallel.ipynb @@ -0,0 +1,965 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fine-tune GPT-2 with near-linear scaling using Sharded Data Parallelism technique in SageMaker Model Parallelism Library" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "This notebook's CI test result for us-west-2 is as follows. CI test results in other regions can be found at the end of the notebook. \n", + "\n", + "![This badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/us-west-2/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook, you learn how to fine-tune the Hugging Face Transformers GPT-2 model with the [Sharded Data Parallelism](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-extended-features-pytorch-sharded-data-parallelism.html) technique in [SageMaker's Model Parallelism library (SMP)](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel.html) with PyTorch 1.13 and the [GLUE/SST2 dataset](https://huggingface.co/datasets/glue/viewer/sst2/train) on SageMaker. \n", + "\n", + "The GPT-2 model was proposed by OpenAI in the paper [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf). The original GPT-2 is a large transformer-based language model with 1.5 billion parameters. In this notebook, you can experiment with the model parameters to achieve different model sizes. This notebook uses the [Hugging Face Transformers GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html) implementation with the SMP integration.\n", + "\n", + "Sharded data parallelism is a distributed training technique that splits the model parameters, gradients, and optimizer states across GPUs in a data parallel group. It is purpose-built for extreme-scale models and leverages Amazon's in-house [MiCS](https://arxiv.org/pdf/2205.00119.pdf) technology, which achieves near-linear scaling efficiency. For large models that cannot fit on a single GPU, we recommend using the sharded data parallelism technique together with [Activation Checkpointing](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-extended-features-pytorch-activation-checkpointing.html) and [Activation Offloading](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-extended-features-pytorch-activation-offloading.html) in SMP first, before turning to other techniques such as tensor parallelism or pipeline parallelism.\n", + "\n", + "\n", + "This notebook is accompanied by the following files:\n", + "\n", + "- `train.py`: The entry point script that is passed to the SageMaker PyTorch estimator later in this notebook when launching the training job. The script runs end-to-end training of the GPT-2 model with SMP, applies the settings for sharded data parallelism, and includes code to save, load, and fine-tune the model.
You can follow the comments throughout the script to learn where the SMP APIs and code modifications are implemented.\n", + "- `data_pipeline.py`: This has data pipeline functions to prepare the training dataset.\n", + "- `learning_rates.py`: This has functions for the learning rate schedule.\n", + "- `requirements.txt`: This installs the dependencies, including Hugging Face Transformers.\n", + "- `memory_tracker.py`: This has functions to track memory usage.\n", + "- `model_config.py`: This has functions to get model configuration information.\n", + "- `sdp_utils.py`: This has utility functions for sharded data parallelism.\n", + "\n", + "### Additional Resources\n", + "- To learn more about the SageMaker model parallelism library, see [Model Parallel Distributed Training with SageMaker Distributed](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel.html).\n", + "\n", + "- To learn more about using the SageMaker Python SDK with PyTorch, see [Using PyTorch with the SageMaker Python SDK](https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html).\n", + "\n", + "- To learn more about launching a training job in Amazon SageMaker with your own training image, see [Use Your Own Training Algorithms](https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo.html).\n", + "\n", + "- To learn more about sharded data parallelism, see [Sharded Data Parallelism](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-extended-features-pytorch-sharded-data-parallelism.html) or the blog [Near-linear scaling of gigantic-model training on AWS](https://www.amazon.science/blog/near-linear-scaling-of-gigantic-model-training-on-aws).\n", + "\n", + "### Prerequisites\n", + "You must create an S3 bucket to store the input data for training. This bucket must be located in the same AWS Region where you choose to launch your training job. To learn how to create an S3 bucket, see [Create your first S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html) in the *Amazon S3 documentation*.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Amazon SageMaker Initialization\n", + "\n", + "Run the following cell to import SageMaker modules and retrieve information about your current SageMaker work environment, such as your AWS account ID, the AWS Region, and the ARN of your Amazon SageMaker execution role. It also upgrades the SageMaker SDK to the latest version. \n", + "\n", + "**NOTE:** This step might require a kernel restart."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install --upgrade sagemaker\n", + "%pip install sagemaker-experiments" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "import os\n", + "\n", + "import boto3\n", + "import sagemaker\n", + "from sagemaker import get_execution_role\n", + "from sagemaker.pytorch import PyTorch\n", + "\n", + "role = (\n", + " get_execution_role()\n", + ") # provide a pre-existing role ARN as an alternative to creating a new role\n", + "print(f\"SageMaker Execution Role: {role}\")\n", + "\n", + "client = boto3.client(\"sts\")\n", + "account = client.get_caller_identity()[\"Account\"]\n", + "print(f\"AWS account: {account}\")\n", + "\n", + "session = boto3.session.Session()\n", + "region = session.region_name\n", + "print(f\"AWS region: {region}\")\n", + "\n", + "sm_boto_client = boto3.client(\"sagemaker\")\n", + "sagemaker_session = sagemaker.session.Session(boto_session=session)\n", + "\n", + "# get default bucket\n", + "default_bucket = sagemaker_session.default_bucket()\n", + "print()\n", + "print(\"Default bucket for this session: \", default_bucket)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download and prepare GLUE/SST2 data\n", + "Here you will download, prepare the GLUE/SST2 dataset and then copy the files to S3." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "### Install the Hugging Face Transformers and Datasets libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! pip install -q datasets transformers==4.21.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import datasets\n", + "from datasets import load_dataset, load_from_disk, load_metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.pytorch import PyTorch\n", + "import transformers\n", + "import logging\n", + "\n", + "from transformers import (\n", + " AutoModelForCausalLM,\n", + " AutoTokenizer,\n", + ")\n", + "\n", + "from transformers.testing_utils import CaptureLogger" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "logger = logging.getLogger(__name__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load data\n", + "This section loads the [GLUE/SST2](https://huggingface.co/datasets/glue/viewer/sst2/train) dataset and splits it to training and validation datasets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hyperparameters = {\n", + " \"dataset_name\": \"glue\",\n", + " \"dataset_config_name\": \"sst2\",\n", + " \"do_train\": True,\n", + " \"do_eval\": True,\n", + " \"cache_dir\": \"tmp\",\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_datasets = load_dataset(\n", + " hyperparameters[\"dataset_name\"],\n", + " hyperparameters[\"dataset_config_name\"],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if \"validation\" not in raw_datasets.keys():\n", + " raw_datasets[\"validation\"] = load_dataset(\n", + " hyperparameters[\"dataset_name\"],\n", + " hyperparameters[\"dataset_config_name\"],\n", + " split=\"train[:5%]\",\n", + " cache_dir=hyperparameters[\"cache_dir\"],\n", + " )\n", + "\n", + " raw_datasets[\"train\"] = load_dataset(\n", + " hyperparameters[\"dataset_name\"],\n", + " hyperparameters[\"dataset_config_name\"],\n", + " split=\"train[5%:]\",\n", + " cache_dir=hyperparameters[\"cache_dir\"],\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load tokenizer\n", + "Nearly every NLP task begins with a tokenizer. A tokenizer converts your text data into a format (token) that can be processed by the NLP model.\n", + "The following cell loads a tokenizer for GPT-2 using [AutoTokenizer.from_pretrained()](https://huggingface.co/docs/transformers/v4.19.4/en/autoclass_tutorial#autotokenizer)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer_kwargs = {\n", + " \"cache_dir\": hyperparameters[\"cache_dir\"],\n", + "}\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\", **tokenizer_kwargs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preprocess data\n", + "\n", + "The following two cells set up a function to run the tokenizer and group texts into chunks smaller than the block size." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def tokenize_function(examples):\n", + " tok_logger = transformers.utils.logging.get_logger(\"transformers.tokenization_utils_base\")\n", + "\n", + " with CaptureLogger(tok_logger) as cl:\n", + " output = tokenizer(examples[text_column_name])\n", + " # clm input could be much much longer than block_size\n", + " if \"Token indices sequence length is longer than the\" in cl.out:\n", + " tok_logger.warning(\n", + " \"^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model.\"\n", + " )\n", + " return output\n", + "\n", + "\n", + "# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.\n", + "def group_texts(examples):\n", + " # Concatenate all texts.\n", + " concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}\n", + " total_length = len(concatenated_examples[list(examples.keys())[0]])\n", + " # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can\n", + " # customize this part to your needs.\n", + " if total_length >= block_size:\n", + " total_length = (total_length // block_size) * block_size\n", + " # Split by chunks of max_len.\n", + " result = {\n", + " k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n", + " for k, t in concatenated_examples.items()\n", + " }\n", + " result[\"labels\"] = result[\"input_ids\"].copy()\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "column_names = raw_datasets[\"train\"].column_names\n", + "text_column_name = \"text\" if \"text\" in column_names else column_names[0]\n", + "\n", + "# since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function\n", + "tok_logger = transformers.utils.logging.get_logger(\"transformers.tokenization_utils_base\")\n", + "\n", + "tokenized_datasets = raw_datasets.map(\n", + " tokenize_function,\n", + " batched=True,\n", + " num_proc=1,\n", + " remove_columns=column_names,\n", + " desc=\"Running tokenizer on dataset\",\n", + ")\n", + "\n", + "\n", + "block_size = tokenizer.model_max_length\n", + "if block_size > 1024:\n", + " logger.warning(\n", + " f\"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). \"\n", + " \"Picking 1024 instead. You can change that default value by passing --block_size xxx.\"\n", + " )\n", + " block_size = 1024\n", + "else:\n", + " if block_size > tokenizer.model_max_length:\n", + " logger.warning(\n", + " f\"The block_size passed ({block_size}) is larger than the maximum length for the model\"\n", + " f\"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}.\"\n", + " )\n", + " block_size = min(block_size, tokenizer.model_max_length)\n", + "\n", + "lm_datasets = tokenized_datasets.map(\n", + " group_texts,\n", + " batched=True,\n", + " # num_proc=args.preprocessing_num_workers,\n", + " desc=f\"Grouping texts in chunks of {block_size}\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set additional hyperparameters and S3 paths for mapping the train and validation datasets properly depending on the phase (training or validation) of the training job in each epoch." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if hyperparameters[\"do_train\"]:\n", + " if \"train\" not in tokenized_datasets:\n", + " raise ValueError(\"--do_train requires a train dataset\")\n", + " train_dataset = lm_datasets[\"train\"]\n", + "\n", + "\n", + "if hyperparameters[\"do_eval\"]:\n", + " if \"validation\" not in tokenized_datasets:\n", + " raise ValueError(\"--do_eval requires a validation dataset\")\n", + " eval_dataset = lm_datasets[\"validation\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "training_dataset_location = None\n", + "validation_dataset_location = None\n", + "\n", + "\n", + "if hyperparameters[\"do_train\"]:\n", + " train_dataset.to_json(\"./training.json\")\n", + " training_dataset_location = \"s3://{}/dataset/train/\".format(default_bucket)\n", + "\n", + "if hyperparameters[\"do_eval\"]:\n", + " eval_dataset.to_json(\"./validation.json\")\n", + " validation_dataset_location = \"s3://{}/dataset/validation/\".format(default_bucket)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if training_dataset_location is not None:\n", + " command = \"aws s3 cp ./training.json {}\".format(training_dataset_location)\n", + " os.system(command)\n", + "\n", + "if validation_dataset_location is not None:\n", + " command = \"aws s3 cp ./validation.json {}\".format(validation_dataset_location)\n", + " os.system(command)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if hyperparameters[\"do_train\"]:\n", + " command = \"rm ./training.json\"\n", + " os.system(command)\n", + "\n", + "if hyperparameters[\"do_eval\"]:\n", + " command = \"rm ./validation.json\"\n", + " os.system(command)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%store training_dataset_location\n", + "%store validation_dataset_location" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%store" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Specify Amazon S3 bucket paths" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here you need to specify the paths for the training data to be used by your job. The bucket must be in the same Region where the training job will run. In the cells above, you downloaded the GLUE/SST2 training and validation split datasets and uploaded the JSON files to an S3 bucket in your account. This example trains on those JSON files.\n", + "\n", + "After you successfully run this example sharded data parallel training job, you can modify the S3 bucket to where your own dataset is stored."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%store -r training_dataset_location\n", + "%store -r validation_dataset_location\n", + "\n", + "# if you're bringing your own data, uncomment the following lines and specify the locations there\n", + "# training_dataset_location = YOUR_S3_BUCKET/training\n", + "# validation_dataset_location = YOUR_S3_BUCKET/validation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s3_train_bucket = training_dataset_location\n", + "s3_test_bucket = validation_dataset_location" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following S3 bucket will store the output artifacts of the training job. You can modify this as needed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s3_output_bucket = f\"s3://sagemaker-{region}-{account}/smp-tensorparallel-outputdir/\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Data Channels for SageMaker Training Using Amazon S3\n", + "\n", + "In this step, define SageMaker training data channels to the S3 buckets. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set use_fsx to False by default\n", + "# Set below var to True if you want to use fsx (see next cell)\n", + "use_fsx = False\n", + "if not use_fsx:\n", + " if s3_train_bucket != None:\n", + " train = sagemaker.inputs.TrainingInput(\n", + " s3_train_bucket, distribution=\"FullyReplicated\", s3_data_type=\"S3Prefix\"\n", + " )\n", + " data_channels = {\"train\": train}\n", + " else:\n", + " data_channels = {\"train\": mock_data}\n", + " if s3_test_bucket != None:\n", + " test = sagemaker.inputs.TrainingInput(\n", + " s3_test_bucket, distribution=\"FullyReplicated\", s3_data_type=\"S3Prefix\"\n", + " )\n", + " data_channels[\"test\"] = test\n", + " else:\n", + " data_channels[\"test\"] = mock_data\n", + " print(data_channels)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## (Optional) Set Up and Use Amazon FSx for Data Channels and Checkpoints\n", + "\n", + "While the previous option of using Amazon S3 is easier to setup, using an FSx can be beneficial for performance when dealing with large input sizes and large model sizes. If you are using models above 13B, checkpointing should be done using FSx. \n", + "\n", + "Please see the instructions from [Distributed Training of Mask-RCNN in Amazon SageMaker Using FSx](https://github.com/aws/amazon-sagemaker-examples/blob/master/advanced_functionality/distributed_tensorflow_mask_rcnn/mask-rcnn-scriptmode-fsx.ipynb) to create an FSx Lustre file system and import the dataset from the S3 bucket to your FSx file system. Note that the FSx file system must be created in a private subnet with internet gateway to ensure that training job has access to the internet. For general guidance on setting an FSx Lustre file system as data input channel, see [Configure Data Input Channel to Use Amazon FSx for Lustre](https://docs.aws.amazon.com/sagemaker/latest/dg/model-access-training-data.html#model-access-training-data-fsx)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Instructions obtained from:\n", + "# https://github.com/aws/amazon-sagemaker-examples/blob/master/advanced_functionality/distributed_tensorflow_mask_rcnn/mask-rcnn-scriptmode-fsx.ipynb\n", + "\n", + "if use_fsx:\n", + " from sagemaker.inputs import FileSystemInput\n", + "\n", + " # Specify FSx Lustre file system id.\n", + " file_system_id = \"\"\n", + "\n", + " # Specify the SG and subnet used by the FSX, these are passed to SM Estimator so jobs use this as well\n", + " fsx_security_group_id = \"\"\n", + " fsx_subnet = \"\"\n", + "\n", + " # Specify directory path for input data on the file system.\n", + " # You need to provide normalized and absolute path below.\n", + " # Your mount name can be provided by you when creating fsx, or generated automatically.\n", + " # You can find this mount_name on the FSX page in console.\n", + " # Example of fsx generated mount_name: \"3x5lhbmv\"\n", + " base_path = \"\"\n", + "\n", + " # Specify your file system type.\n", + " file_system_type = \"FSxLustre\"\n", + "\n", + " train = FileSystemInput(\n", + " file_system_id=file_system_id,\n", + " file_system_type=file_system_type,\n", + " directory_path=base_path,\n", + " file_system_access_mode=\"rw\",\n", + " )\n", + "\n", + " data_channels = {\"train\": train, \"test\": train}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set hyperparameters, metric definitions, and MPI options\n", + "The following `hyperparameters` dictionary passes arguments to the training script (`train.py`) and sets the model parallel configuration when creating the training job.\n", + "\n", + "You can also add custom `mpi` flags. By default, we have `--mca btl_vader_single_copy_mechanism none` to remove unnecessary logs.\n", + "\n", + "Next, we add a base metric definition to enable metric upload in SageMaker. You can add any further metric definitions.\n", + "\n", + "Note that we add the `sharded_data_parallel_degree` parameter to the `hyperparameters` dictionary. This will be parsed and used when we configure a SageMaker PyTorch estimator to activate sharded data parallelism.\n", + "\n", + "Also note that we add the `fine_tune` parameter, which activates the code lines for fine-tuning in the script `train.py`."
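For orientation, the following is a minimal editorial sketch (not a cell from this notebook) of how a `hyperparameters` dictionary like the one defined in the next cell, the MPI options, and `sharded_data_parallel_degree` typically flow into a SageMaker PyTorch estimator to activate sharded data parallelism. The variable name `smp_estimator`, the framework and Python versions, and the specific option values are assumptions for illustration; the notebook's own estimator cell later in the file is the authoritative configuration.

```python
# Hedged sketch (assumptions noted in comments): how sharded data parallelism is
# typically activated on a SageMaker PyTorch estimator with the SMP library.
import os

from sagemaker.pytorch import PyTorch

smp_options = {
    "enabled": True,
    "parameters": {
        "ddp": True,
        # Mirrors hyperparameters["sharded_data_parallel_degree"] set in the next cell.
        "sharded_data_parallel_degree": 8,
        "delayed_parameter_initialization": True,
        "bf16": True,  # assumed to match the bf16 hyperparameter below
    },
}

mpi_options = {
    "enabled": True,
    "processes_per_host": 8,  # number of GPUs per instance
    "custom_mpi_options": mpioptions,  # the MPI flags built in the next cell
}

smp_estimator = PyTorch(  # illustrative name, not necessarily the notebook's
    entry_point="train.py",
    source_dir=os.getcwd(),
    role=role,
    instance_type="ml.p4d.24xlarge",
    instance_count=1,
    framework_version="1.13.1",  # assumed PyTorch 1.13 container
    py_version="py39",
    distribution={"smdistributed": {"modelparallel": smp_options}, "mpi": mpi_options},
    hyperparameters=hyperparameters,  # the dictionary defined in the next cell
    metric_definitions=metric_definitions,
)

# smp_estimator.fit(inputs=data_channels)  # data_channels as defined above
```

Depending on the script, the sharding degree may be read from `hyperparameters`, from the SMP `parameters` block of the `distribution` argument, or both; the AWS documentation describes setting it in the `distribution` configuration as shown here.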
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hyperparameters = {\n", + " \"max_steps\": 100,\n", + " \"seed\": 12345,\n", + " \"fp16\": 0,\n", + " \"bf16\": 1,\n", + " \"lr\": 2.0e-4,\n", + " \"lr_decay_iters\": 125000,\n", + " \"min_lr\": 0.00001,\n", + " \"lr-decay-style\": \"linear\",\n", + " \"warmup\": 0.01,\n", + " \"num_kept_checkpoints\": 5,\n", + " \"checkpoint_freq\": 200,\n", + " \"logging_freq\": 1,\n", + " \"save_final_full_model\": 0,\n", + " \"delayed_param\": 1,\n", + " \"use_distributed_transformer\": 1,\n", + " \"offload_activations\": 0,\n", + " \"gradient_accumulation\": 1,\n", + " \"validation_freq\": 200,\n", + " \"train_batch_size\": 10,\n", + " \"val_batch_size\": 4,\n", + " \"flash_attention\": 1,\n", + " \"zipped_data\": 0,\n", + " \"epochs\": 100,\n", + " # parameters for activating the fine tuning mode\n", + " \"fine_tune\": 1,\n", + " \"model_name\": \"gpt2-xl\" , \n", + " # parameters for sharded data parallelism\n", + " \"sharded_data_parallel_degree\": 8,\n", + "}\n", + "\n", + "if use_fsx:\n", + " # make sure to update paths for training-dir and test-dir based on the paths of datasets in fsx\n", + " # If you want to resume training, set checkpoint-dir to the same path as a previous job.\n", + " SM_TRAIN_DIR = \"/opt/ml/input/data/train\"\n", + " hyperparameters[\"checkpoint-dir\"] = f\"{SM_TRAIN_DIR}/checkpointdir-job2\"\n", + " hyperparameters[\"model-dir\"] = f\"{SM_TRAIN_DIR}/modeldir-job2\"\n", + " hyperparameters[\"training-dir\"] = f\"{SM_TRAIN_DIR}/datasets/pytorch_gpt2/train_synthetic\"\n", + " hyperparameters[\"test-dir\"] = f\"{SM_TRAIN_DIR}/datasets/pytorch_gpt2/val_synthetic\"\n", + "\n", + "# The checkpoint path (hyperparameters['checkpoint-dir'] or checkpoint_s3_uri) is not unique per job.\n", + "# You need to modify as needed for different runs.\n", + "# If same path is used for unrelated runs, this may increase time when downloading unnecessary checkpoints,\n", + "# and cause conflicts when loading checkpoints.\n", + "\n", + "mpioptions = \"-x NCCL_DEBUG=WARN -x SMDEBUG_LOG_LEVEL=ERROR \"\n", + "mpioptions += (\n", + " \"-x SMP_DISABLE_D2D=1 -x SMP_D2D_GPU_BUFFER_SIZE_BYTES=1 -x SMP_NCCL_THROTTLE_LIMIT=1 \"\n", + ")\n", + "mpioptions += \"-x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa -x RDMAV_FORK_SAFE=1\"\n", + "\n", + "metric_definitions = [\n", + " {\"Name\": \"base_metric\", \"Regex\": \"<><><><><><>\"}\n", + "] # Add your custom metric definitions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the model configuration. Specify one from `gpt2-30b`, `gpt2-xl` and `gpt2-small`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_config = \"gpt2-xl\"\n", + "\n", + "if model_config == \"gpt2-30b\":\n", + " model_params = {\n", + " \"max_context_width\": 2048,\n", + " \"hidden_width\": 7168,\n", + " \"num_layers\": 48,\n", + " \"num_heads\": 64,\n", + " }\n", + "\n", + "elif model_config == \"gpt2-xl\":\n", + " # 1.5B\n", + " model_params = {\n", + " \"max_context_width\": 2048,\n", + " \"hidden_width\": 1536,\n", + " \"num_layers\": 48,\n", + " \"num_heads\": 24,\n", + " }\n", + "elif model_config == \"gpt2-small\":\n", + " model_params = {\n", + " \"max_context_width\": 2048,\n", + " \"hidden_width\": 768,\n", + " \"num_layers\": 12,\n", + " \"num_heads\": 12,\n", + " }\n", + "else:\n", + " raise RuntimeError(\"Unknown model config\")\n", + "\n", + "for k, v in model_params.items():\n", + " hyperparameters[k] = v" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Specify essential parameters for a SageMaker Training job\n", + "\n", + "Next, you use the [`SageMaker Estimator class`](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) to define a SageMaker Training Job, passing values through the following parameters for training job name, the number of EC2 instances, the instance type, and the size of the volume attached to the instances. \n", + "\n", + "* `instance_count`\n", + "* `instance_type`\n", + "* `volume_size`\n", + "* `base_job_name`\n", + "\n", + "### Update the type and the number of EC2 instance to use\n", + "\n", + "The instance type and the number of instances you specify to the `instance_type` and `instance_count` parameters, respectively, determine the total number of GPUs (world size).\n", + "\n", + "$$ \\text{(world size) = (the number of GPUs on a single instance)}\\times\\text{(the number of instances)}$$\n", + "\n", + "- For GPT-2 with 30-billion parameters, you need at least 16 `ml.p4d.24xlarge` instances.\n", + "- For GPT-2 xl, use 1 `ml.p4d.24xlarge` at least.\n", + "- For GPT-2 small, use 1 `ml.p3.16xlarge` at least." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "instance_type = \"ml.p4d.24xlarge\"\n", + "instance_count = 1\n", + "\n", + "# set to the number of GPUs on that instance\n", + "processes_per_host = 8" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To look up the number of GPUs of different instance types, see [Amazon EC2 Instance Types](https://aws.amazon.com/ec2/instance-types/). Use the section **Accelerated Computing** to see general purpose GPU instances. Note that, for example, a given instance type `p4d.24xlarge` has a corresponding instance type `ml.p4d.24xlarge` in SageMaker.\n", + "For SageMaker supported `ml` instances and cost information, see [Amazon SageMaker Pricing](https://aws.amazon.com/sagemaker/pricing/). 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Specify a base job name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "machine_str = instance_type.split(\".\")[1] + instance_type.split(\".\")[2][:3]\n", + "sharding_degree = hyperparameters[\"sharded_data_parallel_degree\"]\n", + "base_job_name = (\n", + " f'smp-{model_config}-{machine_str}-sdp{sharding_degree}-bs{hyperparameters[\"train_batch_size\"]}'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not use_fsx:\n", + " # If you want to resume training, set checkpoint_s3_uri to the same path as a previous job.\n", + " # Previous checkpoint to load must have same model config.\n", + " checkpoint_bucket = f\"s3://sagemaker-{region}-{account}/\"\n", + " checkpoint_s3_uri = (\n", + " f\"{checkpoint_bucket}/experiments/gpt_synthetic_simpletrainer_checkpoints/{base_job_name}/\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"base_job_name: {base_job_name} checkpoint_s3_uri: {checkpoint_s3_uri}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a SageMaker PyTorch estimator\n", + "\n", + "The following cell constructs a PyTorch estimator using the parameters defined above. To see how the SageMaker APIs and functions are applied to the script, see the `train.py` file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kwargs = {}\n", + "if use_fsx:\n", + " # Use the security group and subnet that was used to create the fsx filesystem\n", + " kwargs[\"security_group_ids\"] = [fsx_security_group_id]\n", + " kwargs[\"subnets\"] = [fsx_subnet]\n", + "\n", + "smp_estimator = PyTorch(\n", + " entry_point=\"train.py\",\n", + " source_dir=os.getcwd(),\n", + " role=role,\n", + " instance_type=instance_type,\n", + " instance_count=instance_count,\n", + " sagemaker_session=sagemaker_session,\n", + " distribution={\n", + " \"mpi\": {\n", + " \"enabled\": True,\n", + " \"processes_per_host\": processes_per_host,\n", + " \"custom_mpi_options\": mpioptions,\n", + " },\n", + " \"smdistributed\": {\n", + " \"modelparallel\": {\n", + " \"enabled\": True,\n", + " \"parameters\": {\n", + " \"ddp\": True,\n", + " \"skip_tracing\": True,\n", + " \"delayed_parameter_initialization\": hyperparameters[\"delayed_param\"] > 0,\n", + " \"offload_activations\": hyperparameters[\"offload_activations\"] > 0,\n", + " \"sharded_data_parallel_degree\": hyperparameters[\"sharded_data_parallel_degree\"],\n", + " \"fp16\": hyperparameters[\"fp16\"] > 0,\n", + " \"bf16\": hyperparameters[\"bf16\"] > 0,\n", + " # partitions is a required param in the current SM SDK so it needs to be passed,\n", + " \"partitions\": 1,\n", + " },\n", + " }\n", + " },\n", + " },\n", + " framework_version=\"1.13\",\n", + " py_version=\"py39\",\n", + " output_path=s3_output_bucket,\n", + " checkpoint_s3_uri=checkpoint_s3_uri if not use_fsx else None,\n", + " checkpoint_local_path=hyperparameters[\"checkpoint-dir\"] if use_fsx else None,\n", + " metric_definitions=metric_definitions,\n", + " hyperparameters=hyperparameters,\n", + " debugger_hook_config=False,\n", + " disable_profiler=True,\n", + " base_job_name=base_job_name,\n", + " **kwargs,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, run 
the `estimator.fit` method to launch the SageMaker training job of fine-tuning the GPT-2 model with sharded data parallelism." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "smp_estimator.fit(inputs=data_channels, logs=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Accessing the Training Logs\n", + "\n", + "You can access the training logs from [Amazon CloudWatch](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/WhatIsCloudWatch.html). Make sure to look at the logs of **algo-1** because that is the main node whose output stream has the training job logs.\n", + "\n", + "You can use CloudWatch to track SageMaker GPU and memory utilization during training and inference. To view the metrics and logs that SageMaker writes to CloudWatch, see [SageMaker Jobs and Endpoint Metrics](https://docs.aws.amazon.com/sagemaker/latest/dg/monitoring-cloudwatch.html#cloudwatch-metrics-jobs) in the Amazon SageMaker Developer Guide.\n", + "\n", + "If you are a new user of CloudWatch, see [Getting Started with Amazon CloudWatch](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/GettingStarted.html). \n", + "\n", + "For additional information on monitoring and analyzing Amazon SageMaker training jobs, see [Monitor and Analyze Training Jobs Using Metrics](https://docs.aws.amazon.com/sagemaker/latest/dg/training-metrics.html).\n", + "\n", + "## Deploying Trained Model for Inference\n", + "\n", + "In most cases, a trained model can be deployed on a single device for inference because inference only requires a small amount of memory. You can use the SMP API to create a single, unified model after training: the [smp.DistributedModel.save_model()](https://sagemaker.readthedocs.io/en/stable/api/training/smp_versions/latest/smd_model_parallel_tensorflow.html#smp.DistributedModel.save_model) method for TensorFlow, and the [smp.save()](https://sagemaker.readthedocs.io/en/stable/api/training/smp_versions/latest/smd_model_parallel_pytorch.html#apis-for-saving-and-loading) function for PyTorch.\n", + "\n", + "After you build and train your models, you can deploy them to get predictions in one of two ways:\n", + "\n", + "* To set up a persistent endpoint to get predictions from your models, use SageMaker hosting services. For an overview on deploying a single model or multiple models with SageMaker hosting services, see [Deploy a Model on SageMaker Hosting Services](https://docs.aws.amazon.com/sagemaker/latest/dg/how-it-works-deployment.html#how-it-works-hosting).\n", + "* To get predictions for an entire dataset, use SageMaker batch transform. For an overview on deploying a model with SageMaker Batch Transform, see [Get Inferences for an Entire Dataset with Batch Transform](https://docs.aws.amazon.com/sagemaker/latest/dg/how-it-works-batch.html).\n", + "\n", + "To learn more about deploying models for inference using SageMaker, see [Deploy Models for Inference](https://docs.aws.amazon.com/sagemaker/latest/dg/deploy-model.html). \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Notebook CI Test Results\n", + "\n", + "This notebook was tested in multiple regions. The test results are as follows, except for us-west-2 which is shown at the top of the notebook.\n", + "\n", + "![This badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/us-east-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/us-east-2/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/us-west-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ca-central-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/sa-east-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/eu-west-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/eu-west-2/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/eu-west-3/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/eu-central-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/eu-north-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ap-southeast-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ap-southeast-2/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ap-northeast-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ap-northeast-2/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ap-south-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n" + ] + } + ], + "metadata": { + "hide_input": false, + "instance_type": "ml.t3.medium", + "kernelspec": { + "display_name": "conda_pytorch_p310", + "language": "python", + "name": "conda_pytorch_p310" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-sharded-data-parallel.ipynb b/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-sharded-data-parallel.ipynb new file mode 100644 index 0000000000..0245941762 --- /dev/null +++ b/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-sharded-data-parallel.ipynb @@ -0,0 +1,964 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train GPT-2 with near-linear scaling using the sharded data parallelism technique in the SageMaker Model Parallelism library" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "This notebook's CI test result for us-west-2 is as follows. CI test results in other regions can be found at the end of the notebook. \n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/us-west-2/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook, you learn how to train the Hugging Face Transformers GPT-2 model with the [Sharded Data Parallelism](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-extended-features-pytorch-sharded-data-parallelism.html) technique in [SageMaker's Model Parallelism library (SMP)](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel.html) with PyTorch 1.13 and [GLUE/SST2 dataset](https://huggingface.co/datasets/glue/viewer/sst2/train) on SageMaker. 
\n", + "\n", + "The GPT-2 model was proposed by OpenAI in the paper [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf). The original GPT-2 is a large transformer-based language model with 1.5 billion parameters. In this notebook, you can experiment with the model parameters to achieve different model sizes. This notebook uses the [Hugging Face Transformers GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html) implementation with the SMP integration.\n", + "\n", + "Sharded data parallelism is a distributed training technique that splits the model parameters, gradients, and optimizer states across GPUs in a data parallel group. It is purpose-built for extreme-scale models and leverages Amazon in-house [MiCS](https://arxiv.org/pdf/2205.00119.pdf) technology which achieves a near-linear scaling efficiency. For large models that cannot fit into a single GPU, we recommend to use the sharded data parallelism technique with [Activation Checkpointing](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-extended-features-pytorch-activation-checkpointing.html) and [Activation Offloading](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-extended-features-pytorch-activation-offloading.html) in SMP first, before leveraging other techniques such as tensor parallelism or pipeline parallelism.\n", + "\n", + "\n", + "This notebook is accompanied with the following files:\n", + "\n", + "- `train.py`: The entry point script that'll be passed to the SageMaker PyTorch estimator later in this notebook when launching the training job. This script is prepared to run an end-to-end training of the GPT-2 model with SMP, settings for sharded data parallelism applied, and implemented with code lines to save, load, and fine-tune the model. 
You can follow the comments throughout the script to learn where the SMP APIs and code modifications are implemented.\n", + "- `data_pipeline.py`: This has data pipeline functions to prepare the training dataset.\n", + "- `learining_rate.py`: This has functions for the learning rate schedule.\n", + "- `requirements.txt`: This installs the dependencies, including Hugging Face Transformers.\n", + "- `memory_tracker.py`: This has functions to track memory usage.\n", + "- `model_config.py`: This has functions to get model configuration information.\n", + "- `sdp_utils.py`: This has utility functions for sharded data parallelism.\n", + "\n", + "### Additional Resources\n", + "- To learn more about the SageMaker model parallelism library, see [Model Parallel Distributed Training with SageMaker Distributed](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel.html).\n", + "\n", + "- To learn more about using the SageMaker Python SDK with PyTorch, see [Using PyTorch with the SageMaker Python SDK](https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html).\n", + "\n", + "- To learn more about launching a training job in Amazon SageMaker with your own training image, see [Use Your Own Training Algorithms](https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo.html).\n", + "\n", + "- To learn more about sharded data parallelism, see [Sharded Data Parallelism](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-extended-features-pytorch-sharded-data-parallelism.html) or the blog [Near-linear scaling of gigantic-model training on AWS](https://www.amazon.science/blog/near-linear-scaling-of-gigantic-model-training-on-aws).\n", + "\n", + "### Prerequisites\n", + "You must create an S3 bucket to store the input data for training. This bucket must be located in the same AWS Region where you launch your training job. To learn how to create an S3 bucket, see [Create your first S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html) in the *Amazon S3 documentation*.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Amazon SageMaker Initialization\n", + "\n", + "Run the following cell to import the SageMaker modules and retrieve information about your current SageMaker work environment, such as your AWS account ID, the AWS Region, and the ARN of your Amazon SageMaker execution role. Upgrade the SageMaker SDK to the latest version. \n", + "\n", + "**NOTE:** This step might require a kernel restart."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install --upgrade sagemaker\n", + "# %pip install sagemaker-experiments" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "import os\n", + "\n", + "import boto3\n", + "import sagemaker\n", + "from sagemaker import get_execution_role\n", + "from sagemaker.pytorch import PyTorch\n", + "\n", + "role = (\n", + " get_execution_role()\n", + ") # provide a pre-existing role ARN as an alternative to creating a new role\n", + "print(f\"SageMaker Execution Role: {role}\")\n", + "\n", + "client = boto3.client(\"sts\")\n", + "account = client.get_caller_identity()[\"Account\"]\n", + "print(f\"AWS account: {account}\")\n", + "\n", + "session = boto3.session.Session()\n", + "region = session.region_name\n", + "print(f\"AWS region: {region}\")\n", + "\n", + "sm_boto_client = boto3.client(\"sagemaker\")\n", + "sagemaker_session = sagemaker.session.Session(boto_session=session)\n", + "\n", + "# get default bucket\n", + "default_bucket = sagemaker_session.default_bucket()\n", + "print()\n", + "print(\"Default bucket for this session: \", default_bucket)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download and prepare GLUE/SST2 data\n", + "Here you will download, prepare the GLUE/SST2 dataset and then copy the files to S3." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Install the Hugging Face Transformers and Datasets libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "! pip install -q datasets transformers==4.21.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import datasets\n", + "from datasets import load_dataset, load_from_disk, load_metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.pytorch import PyTorch\n", + "import transformers\n", + "print(transformers.__version__)\n", + "import logging\n", + "\n", + "from transformers import (\n", + " AutoModelForCausalLM,\n", + " AutoTokenizer,\n", + ")\n", + "\n", + "from transformers.testing_utils import CaptureLogger" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "logger = logging.getLogger(__name__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load data\n", + "This section loads the [GLUE/SST2](https://huggingface.co/datasets/glue/viewer/sst2/train) dataset and splits it to training and validation datasets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hyperparameters = {\n", + " \"dataset_name\": \"glue\",\n", + " \"dataset_config_name\": \"sst2\",\n", + " \"do_train\": True,\n", + " \"do_eval\": True,\n", + " \"cache_dir\": \"tmp\",\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_datasets = load_dataset(\n", + " hyperparameters[\"dataset_name\"],\n", + " hyperparameters[\"dataset_config_name\"],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if \"validation\" not in raw_datasets.keys():\n", + " raw_datasets[\"validation\"] = load_dataset(\n", + " hyperparameters[\"dataset_name\"],\n", + " hyperparameters[\"dataset_config_name\"],\n", + " split=\"train[:5%]\",\n", + " cache_dir=hyperparameters[\"cache_dir\"],\n", + " )\n", + "\n", + " raw_datasets[\"train\"] = load_dataset(\n", + " hyperparameters[\"dataset_name\"],\n", + " hyperparameters[\"dataset_config_name\"],\n", + " split=\"train[5%:]\",\n", + " cache_dir=hyperparameters[\"cache_dir\"],\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load tokenizer\n", + "Nearly every NLP task begins with a tokenizer. A tokenizer converts your text data into a format (token) that can be processed by the NLP model.\n", + "The following cell loads a tokenizer for GPT-2 using [AutoTokenizer.from_pretrained()](https://huggingface.co/docs/transformers/v4.19.4/en/autoclass_tutorial#autotokenizer)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer_kwargs = {\n", + " \"cache_dir\": hyperparameters[\"cache_dir\"],\n", + "}\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\", **tokenizer_kwargs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preprocess data\n", + "\n", + "The following two cells set up a function to run the tokenizer and group texts into chunks smaller than the block size." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def tokenize_function(examples):\n", + " tok_logger = transformers.utils.logging.get_logger(\"transformers.tokenization_utils_base\")\n", + "\n", + " with CaptureLogger(tok_logger) as cl:\n", + " output = tokenizer(examples[text_column_name])\n", + " # clm input could be much much longer than block_size\n", + " if \"Token indices sequence length is longer than the\" in cl.out:\n", + " tok_logger.warning(\n", + " \"^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model.\"\n", + " )\n", + " return output\n", + "\n", + "\n", + "# Main data processing function to concatenate all texts \n", + "# from the dataset and generate chunks of texts smaller than block_size.\n", + "def group_texts(examples):\n", + " # Concatenate all texts.\n", + " concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}\n", + " total_length = len(concatenated_examples[list(examples.keys())[0]])\n", + " # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can\n", + " # customize this part to your needs.\n", + " if total_length >= block_size:\n", + " total_length = (total_length // block_size) * block_size\n", + " # Split by chunks of max_len.\n", + " result = {\n", + " k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n", + " for k, t in concatenated_examples.items()\n", + " }\n", + " result[\"labels\"] = result[\"input_ids\"].copy()\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "column_names = raw_datasets[\"train\"].column_names\n", + "text_column_name = \"text\" if \"text\" in column_names else column_names[0]\n", + "\n", + "# since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function\n", + "tok_logger = transformers.utils.logging.get_logger(\"transformers.tokenization_utils_base\")\n", + "\n", + "tokenized_datasets = raw_datasets.map(\n", + " tokenize_function,\n", + " batched=True,\n", + " num_proc=1,\n", + " remove_columns=column_names,\n", + " desc=\"Running tokenizer on dataset\",\n", + ")\n", + "\n", + "\n", + "block_size = tokenizer.model_max_length\n", + "if block_size > 1024:\n", + " logger.warning(\n", + " f\"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). \"\n", + " \"Picking 1024 instead. You can change that default value by passing --block_size xxx.\"\n", + " )\n", + " block_size = 1024\n", + "else:\n", + " if block_size > tokenizer.model_max_length:\n", + " logger.warning(\n", + " f\"The block_size passed ({block_size}) is larger than the maximum length for the model\"\n", + " f\"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}.\"\n", + " )\n", + " block_size = min(block_size, tokenizer.model_max_length)\n", + "\n", + "lm_datasets = tokenized_datasets.map(\n", + " group_texts,\n", + " batched=True,\n", + " # num_proc=args.preprocessing_num_workers,\n", + " desc=f\"Grouping texts in chunks of {block_size}\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set additional hyperparameters and S3 paths for mapping the train and validation datasets properly depending on the phase (training or validation) of the training job in each epoch." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if hyperparameters[\"do_train\"]:\n", + "    if \"train\" not in tokenized_datasets:\n", + "        raise ValueError(\"--do_train requires a train dataset\")\n", + "    train_dataset = lm_datasets[\"train\"]\n", + "\n", + "\n", + "if hyperparameters[\"do_eval\"]:\n", + "    if \"validation\" not in tokenized_datasets:\n", + "        raise ValueError(\"--do_eval requires a validation dataset\")\n", + "    eval_dataset = lm_datasets[\"validation\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "training_dataset_location = None\n", + "validation_dataset_location = None\n", + "\n", + "\n", + "if hyperparameters[\"do_train\"]:\n", + "    train_dataset.to_json(\"./training.json\")\n", + "    training_dataset_location = \"s3://{}/dataset/train/\".format(default_bucket)\n", + "\n", + "if hyperparameters[\"do_eval\"]:\n", + "    eval_dataset.to_json(\"./validation.json\")\n", + "    validation_dataset_location = \"s3://{}/dataset/validation/\".format(default_bucket)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if training_dataset_location is not None:\n", + "    command = \"aws s3 cp ./training.json {}\".format(training_dataset_location)\n", + "    os.system(command)\n", + "\n", + "if validation_dataset_location is not None:\n", + "    command = \"aws s3 cp ./validation.json {}\".format(validation_dataset_location)\n", + "    os.system(command)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if hyperparameters[\"do_train\"]:\n", + "    command = \"rm ./training.json\"\n", + "    os.system(command)\n", + "\n", + "if hyperparameters[\"do_eval\"]:\n", + "    command = \"rm ./validation.json\"\n", + "    os.system(command)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%store training_dataset_location\n", + "%store validation_dataset_location" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%store" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Specify Amazon S3 bucket paths" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here you need to specify the paths for the training data to be used by your job. The bucket must be in the same Region where the training job runs. In the cells above, you downloaded the GLUE/SST2 training and validation split datasets and uploaded the JSON files to an S3 bucket in your account. This example trains on those JSON files.\n", + "\n", + "After you successfully run this example sharded data parallel training job, you can modify the S3 bucket paths to point to where your own dataset is stored."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%store -r training_dataset_location\n", + "%store -r validation_dataset_location\n", + "\n", + "# if you're bringing your own data, uncomment the following lines and specify the locations there\n", + "# training_dataset_location = YOUR_S3_BUCKET/training\n", + "# validation_dataset_location = YOUR_S3_BUCKET/validation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s3_train_bucket = training_dataset_location\n", + "s3_test_bucket = validation_dataset_location" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following S3 bucket will store the output artifacts of the training job. You can modify this as needed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s3_output_bucket = f\"s3://sagemaker-{region}-{account}/smp-tensorparallel-outputdir/\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Data Channels for SageMaker Training Using Amazon S3\n", + "\n", + "In this step, define SageMaker training data channels to the S3 buckets. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set use_fsx to False by default\n", + "# Set below var to True if you want to use fsx (see next cell)\n", + "use_fsx = False\n", + "if not use_fsx:\n", + " if s3_train_bucket != None:\n", + " train = sagemaker.inputs.TrainingInput(\n", + " s3_train_bucket, distribution=\"FullyReplicated\", s3_data_type=\"S3Prefix\"\n", + " )\n", + " data_channels = {\"train\": train}\n", + " else:\n", + " data_channels = {\"train\": mock_data}\n", + " if s3_test_bucket != None:\n", + " test = sagemaker.inputs.TrainingInput(\n", + " s3_test_bucket, distribution=\"FullyReplicated\", s3_data_type=\"S3Prefix\"\n", + " )\n", + " data_channels[\"test\"] = test\n", + " else:\n", + " data_channels[\"test\"] = mock_data\n", + " print(data_channels)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## (Optional) Set Up and Use Amazon FSx for Data Channels and Checkpoints\n", + "\n", + "While the previous option of using Amazon S3 is easier to setup, using an FSx can be beneficial for performance when dealing with large input sizes and large model sizes. If you are using models above 13B, checkpointing should be done using FSx. \n", + "\n", + "Please see the instructions from [Distributed Training of Mask-RCNN in Amazon SageMaker Using FSx](https://github.com/aws/amazon-sagemaker-examples/blob/master/advanced_functionality/distributed_tensorflow_mask_rcnn/mask-rcnn-scriptmode-fsx.ipynb) to create an FSx Lustre file system and import the dataset from the S3 bucket to your FSx file system. Note that the FSx file system must be created in a private subnet with internet gateway to ensure that training job has access to the internet. For general guidance on setting an FSx Lustre file system as data input channel, see [Configure Data Input Channel to Use Amazon FSx for Lustre](https://docs.aws.amazon.com/sagemaker/latest/dg/model-access-training-data.html#model-access-training-data-fsx)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Instructions obtained from:\n", + "# https://github.com/aws/amazon-sagemaker-examples/blob/master/advanced_functionality/distributed_tensorflow_mask_rcnn/mask-rcnn-scriptmode-fsx.ipynb\n", + "\n", + "if use_fsx:\n", + "    from sagemaker.inputs import FileSystemInput\n", + "\n", + "    # Specify the FSx Lustre file system ID.\n", + "    file_system_id = \"\"\n", + "\n", + "    # Specify the security group and subnet used by the FSx file system; these are passed to the SageMaker estimator so that training jobs use them as well.\n", + "    fsx_security_group_id = \"\"\n", + "    fsx_subnet = \"\"\n", + "\n", + "    # Specify the directory path for input data on the file system.\n", + "    # You need to provide a normalized, absolute path below.\n", + "    # The mount name is either specified by you when creating the FSx file system or generated automatically.\n", + "    # You can find the mount name on the FSx page in the console.\n", + "    # Example of an automatically generated mount name: \"3x5lhbmv\"\n", + "    base_path = \"\"\n", + "\n", + "    # Specify your file system type.\n", + "    file_system_type = \"FSxLustre\"\n", + "\n", + "    train = FileSystemInput(\n", + "        file_system_id=file_system_id,\n", + "        file_system_type=file_system_type,\n", + "        directory_path=base_path,\n", + "        file_system_access_mode=\"rw\",\n", + "    )\n", + "\n", + "    data_channels = {\"train\": train, \"test\": train}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set hyperparameters, metric definitions, and MPI options\n", + "The following `hyperparameters` dictionary passes arguments to the training script (`train.py`) and sets the model parallel configuration when creating the training job.\n", + "\n", + "You can also add custom `mpi` flags. By default, we have `--mca btl_vader_single_copy_mechanism none` to remove unnecessary logs.\n", + "\n", + "Next, we add a base metric definition to enable metric upload in SageMaker. You can add further metric definitions as needed.\n", + "\n", + "Note that we add the `sharded_data_parallel_degree` parameter to the `hyperparameters` dictionary. This will be parsed and used when we configure a SageMaker PyTorch estimator to activate sharded data parallelism."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hyperparameters = {\n", + "    \"max_steps\": 100,\n", + "    \"seed\": 12345,\n", + "    \"fp16\": 0,\n", + "    \"bf16\": 1,\n", + "    \"lr\": 2.0e-4,\n", + "    \"lr_decay_iters\": 125000,\n", + "    \"min_lr\": 0.00001,\n", + "    \"lr-decay-style\": \"linear\",\n", + "    \"warmup\": 0.01,\n", + "    \"num_kept_checkpoints\": 5,\n", + "    \"checkpoint_freq\": 200,\n", + "    \"logging_freq\": 1,\n", + "    \"save_final_full_model\": 0,\n", + "    \"delayed_param\": 1,\n", + "    \"use_distributed_transformer\": 1,\n", + "    \"offload_activations\": 0,\n", + "    \"activation_loading_horizon\": 4,\n", + "    \"gradient_accumulation\": 1,\n", + "    \"validation_freq\": 200,\n", + "    \"train_batch_size\": 10,\n", + "    \"val_batch_size\": 4,\n", + "    \"flash_attention\": 1,\n", + "    \"zipped_data\": 0,\n", + "    \"epochs\": 100,\n", + "    # parameters for sharded data parallelism\n", + "    \"sharded_data_parallel_degree\": 32,\n", + "}\n", + "\n", + "if use_fsx:\n", + "    # Make sure to update the paths for training-dir and test-dir based on the dataset paths in FSx.\n", + "    # If you want to resume training, set checkpoint-dir to the same path as a previous job.\n", + "    SM_TRAIN_DIR = \"/opt/ml/input/data/train\"\n", + "    hyperparameters[\"checkpoint-dir\"] = f\"{SM_TRAIN_DIR}/checkpointdir-job2\"\n", + "    hyperparameters[\"model-dir\"] = f\"{SM_TRAIN_DIR}/modeldir-job2\"\n", + "    hyperparameters[\"training-dir\"] = f\"{SM_TRAIN_DIR}/datasets/pytorch_gpt2/train_synthetic\"\n", + "    hyperparameters[\"test-dir\"] = f\"{SM_TRAIN_DIR}/datasets/pytorch_gpt2/val_synthetic\"\n", + "\n", + "# The checkpoint path (hyperparameters['checkpoint-dir'] or checkpoint_s3_uri) is not unique per job.\n", + "# Modify it as needed for different runs.\n", + "# If the same path is used for unrelated runs, it may increase the time spent downloading unnecessary checkpoints\n", + "# and cause conflicts when loading checkpoints.\n", + "\n", + "mpioptions = \"-x NCCL_DEBUG=WARN -x SMDEBUG_LOG_LEVEL=ERROR \"\n", + "mpioptions += (\n", + "    \"-x SMP_DISABLE_D2D=1 -x SMP_D2D_GPU_BUFFER_SIZE_BYTES=1 -x SMP_NCCL_THROTTLE_LIMIT=1 \"\n", + ")\n", + "mpioptions += \"-x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa -x RDMAV_FORK_SAFE=1\"\n", + "\n", + "metric_definitions = [\n", + "    {\"Name\": \"base_metric\", \"Regex\": \"<><><><><><>\"}\n", + "] # Add your custom metric definitions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the model configuration by choosing one from `gpt2-30b`, `gpt2-xl`, and `gpt2-small`."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_config = \"gpt2-30b\"\n", + "\n", + "if model_config == \"gpt2-30b\":\n", + " model_params = {\n", + " \"max_context_width\": 2048,\n", + " \"hidden_width\": 7168,\n", + " \"num_layers\": 48,\n", + " \"num_heads\": 64,\n", + " }\n", + "\n", + "elif model_config == \"gpt2-xl\":\n", + " # 1.5B\n", + " model_params = {\n", + " \"max_context_width\": 2048,\n", + " \"hidden_width\": 1536,\n", + " \"num_layers\": 48,\n", + " \"num_heads\": 24,\n", + " }\n", + "elif model_config == \"gpt2-small\":\n", + " model_params = {\n", + " \"max_context_width\": 2048,\n", + " \"hidden_width\": 768,\n", + " \"num_layers\": 12,\n", + " \"num_heads\": 12,\n", + " }\n", + "else:\n", + " raise RuntimeError(\"Unknown model config\")\n", + "\n", + "for k, v in model_params.items():\n", + " hyperparameters[k] = v" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Specify essential parameters for a SageMaker Training job\n", + "\n", + "Next, you use the [`SageMaker Estimator class`](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) to define a SageMaker Training Job, passing values through the following parameters for training job name, the number of EC2 instances, the instance type, and the size of the volume attached to the instances. \n", + "\n", + "* `instance_count`\n", + "* `instance_type`\n", + "* `volume_size`\n", + "* `base_job_name`\n", + "\n", + "### Update the type and the number of EC2 instance to use\n", + "\n", + "The instance type and the number of instances you specify to the `instance_type` and `instance_count` parameters, respectively, determine the total number of GPUs (world size).\n", + "\n", + "$$ \\text{(world size) = (the number of GPUs on a single instance)}\\times\\text{(the number of instances)}$$\n", + "\n", + "- For GPT-2 with 30-billion parameters, you need at least 16 `ml.p4d.24xlarge` instances.\n", + "- For GPT-2 xl, use 1 `ml.p4d.24xlarge` at least.\n", + "- For GPT-2 small, use 1 `ml.p3.16xlarge` at least." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "instance_type = \"ml.p4d.24xlarge\"\n", + "instance_count = 16\n", + "\n", + "# set to the number of GPUs on that instance\n", + "processes_per_host = 8" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To look up the number of GPUs of different instance types, see [Amazon EC2 Instance Types](https://aws.amazon.com/ec2/instance-types/). Use the section **Accelerated Computing** to see general purpose GPU instances. Note that, for example, a given instance type `p4d.24xlarge` has a corresponding instance type `ml.p4d.24xlarge` in SageMaker.\n", + "For SageMaker supported `ml` instances and cost information, see [Amazon SageMaker Pricing](https://aws.amazon.com/sagemaker/pricing/). 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Specify a base job name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "machine_str = instance_type.split(\".\")[1] + instance_type.split(\".\")[2][:3]\n", + "sharding_degree = hyperparameters[\"sharded_data_parallel_degree\"]\n", + "base_job_name = (\n", + " f'smp-{model_config}-{machine_str}-sdp{sharding_degree}-bs{hyperparameters[\"train_batch_size\"]}'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not use_fsx:\n", + " # If you want to resume training, set checkpoint_s3_uri to the same path as a previous job.\n", + " # Previous checkpoint to load must have same model config.\n", + " checkpoint_bucket = f\"s3://sagemaker-{region}-{account}/\"\n", + " checkpoint_s3_uri = (\n", + " f\"{checkpoint_bucket}/experiments/gpt_synthetic_simpletrainer_checkpoints/{base_job_name}/\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"base_job_name: {base_job_name} checkpoint_s3_uri: {checkpoint_s3_uri}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a SageMaker PyTorch estimator\n", + "\n", + "The following cell constructs a PyTorch estimator using the parameters defined above. To see how the SageMaker APIs and functions are applied to the script, see the `train.py` file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kwargs = {}\n", + "if use_fsx:\n", + " # Use the security group and subnet that was used to create the fsx filesystem\n", + " kwargs[\"security_group_ids\"] = [fsx_security_group_id]\n", + " kwargs[\"subnets\"] = [fsx_subnet]\n", + "\n", + "smp_estimator = PyTorch(\n", + " entry_point=\"train.py\",\n", + " source_dir=os.getcwd(),\n", + " role=role,\n", + " instance_type=instance_type,\n", + " instance_count=instance_count,\n", + " sagemaker_session=sagemaker_session,\n", + " distribution={\n", + " \"mpi\": {\n", + " \"enabled\": True,\n", + " \"processes_per_host\": processes_per_host,\n", + " \"custom_mpi_options\": mpioptions,\n", + " },\n", + " \"smdistributed\": {\n", + " \"modelparallel\": {\n", + " \"enabled\": True,\n", + " \"parameters\": {\n", + " \"ddp\": True,\n", + " \"skip_tracing\": True,\n", + " \"delayed_parameter_initialization\": hyperparameters[\"delayed_param\"] > 0,\n", + " \"offload_activations\": hyperparameters[\"offload_activations\"] > 0,\n", + " \"activation_loading_horizon\": hyperparameters[\"activation_loading_horizon\"],\n", + " \"sharded_data_parallel_degree\": hyperparameters[\"sharded_data_parallel_degree\"],\n", + " \"fp16\": hyperparameters[\"fp16\"] > 0,\n", + " \"bf16\": hyperparameters[\"bf16\"] > 0,\n", + " # partitions is a required param in the current SM SDK so it needs to be passed,\n", + " \"partitions\": 1,\n", + " },\n", + " }\n", + " },\n", + " },\n", + " framework_version=\"1.13\",\n", + " py_version=\"py39\",\n", + " output_path=s3_output_bucket,\n", + " checkpoint_s3_uri=checkpoint_s3_uri if not use_fsx else None,\n", + " checkpoint_local_path=hyperparameters[\"checkpoint-dir\"] if use_fsx else None,\n", + " metric_definitions=metric_definitions,\n", + " hyperparameters=hyperparameters,\n", + " debugger_hook_config=False,\n", + " disable_profiler=True,\n", + " base_job_name=base_job_name,\n", + " **kwargs,\n", + ")" + 
] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, run the `estimator.fit` method to launch the SageMaker training job of the GPT-2 model with sharded data parallelism." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "smp_estimator.fit(inputs=data_channels, logs=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Accessing the Training Logs\n", + "\n", + "You can access the training logs from [Amazon CloudWatch](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/WhatIsCloudWatch.html). Make sure to look at the logs of **algo-1** because that is the main node whose output stream has the training job logs.\n", + "\n", + "You can use CloudWatch to track SageMaker GPU and memory utilization during training and inference. To view the metrics and logs that SageMaker writes to CloudWatch, see [SageMaker Jobs and Endpoint Metrics](https://docs.aws.amazon.com/sagemaker/latest/dg/monitoring-cloudwatch.html#cloudwatch-metrics-jobs) in the Amazon SageMaker Developer Guide.\n", + "\n", + "If you are a new user of CloudWatch, see [Getting Started with Amazon CloudWatch](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/GettingStarted.html). \n", + "\n", + "For additional information on monitoring and analyzing Amazon SageMaker training jobs, see [Monitor and Analyze Training Jobs Using Metrics](https://docs.aws.amazon.com/sagemaker/latest/dg/training-metrics.html).\n", + "\n", + "## Deploying Trained Model for Inference\n", + "\n", + "In most cases, a trained model can be deployed on a single device for inference because inference only requires a small amount of memory. You can use the SMP API to create a single, unified model after training: the [smp.DistributedModel.save_model()](https://sagemaker.readthedocs.io/en/stable/api/training/smp_versions/latest/smd_model_parallel_tensorflow.html#smp.DistributedModel.save_model) method for TensorFlow, and the [smp.save()](https://sagemaker.readthedocs.io/en/stable/api/training/smp_versions/latest/smd_model_parallel_pytorch.html#apis-for-saving-and-loading) function for PyTorch.\n", + "\n", + "After you build and train your models, you can deploy them to get predictions in one of two ways:\n", + "\n", + "* To set up a persistent endpoint to get predictions from your models, use SageMaker hosting services. For an overview on deploying a single model or multiple models with SageMaker hosting services, see [Deploy a Model on SageMaker Hosting Services](https://docs.aws.amazon.com/sagemaker/latest/dg/how-it-works-deployment.html#how-it-works-hosting).\n", + "* To get predictions for an entire dataset, use SageMaker batch transform. For an overview on deploying a model with SageMaker Batch Transform, see [Get Inferences for an Entire Dataset with Batch Transform](https://docs.aws.amazon.com/sagemaker/latest/dg/how-it-works-batch.html).\n", + "\n", + "To learn more about deploying models for inference using SageMaker, see [Deploy Models for Inference](https://docs.aws.amazon.com/sagemaker/latest/dg/deploy-model.html). \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Notebook CI Test Results\n", + "\n", + "This notebook was tested in multiple regions. The test results are as follows, except for us-west-2 which is shown at the top of the notebook.\n", + "\n", + "![This badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/us-east-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/us-east-2/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/us-west-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ca-central-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/sa-east-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/eu-west-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/eu-west-2/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/eu-west-3/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/eu-central-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/eu-north-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ap-southeast-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ap-southeast-2/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ap-northeast-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ap-northeast-2/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", + "\n", + "![This badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ap-south-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n" + ] + } + ], + "metadata": { + "hide_input": false, + "instance_type": "ml.t3.medium", + "kernelspec": { + "display_name": "conda_pytorch_p310", + "language": "python", + "name": "conda_pytorch_p310" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-simple-sharded-data-parallel.ipynb b/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-simple-sharded-data-parallel.ipynb deleted file mode 100644 index 935f27a163..0000000000 --- a/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-simple-sharded-data-parallel.ipynb +++ /dev/null @@ -1,824 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "# Train GPT-2 with near-linear scaling using Sharded Data Parallelism technique in SageMaker Model Parallelism Library" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "This notebook's CI test result for us-west-2 is as follows. CI test results in other regions can be found at the end of the notebook. \n", - "\n", - "![This us-west-2 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/us-west-2/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", - "\n", - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "In this notebook, you learn how to train Hugging Face Transformers GPT-2 model with the [Sharded Data Parallelism](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-extended-features-pytorch-sharded-data-parallelism.html) technique in [SageMaker's Model Parallelism library (SMP)](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel.html) with PyTorch 1.12 and [openwebtext dataset](https://huggingface.co/datasets/openwebtext) on SageMaker. \n", - "\n", - "The GPT-2 model was proposed by OpenAI in paper [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf). The original GPT-2 is a large transformer-based language model with 1.5 billion parameters. In this notebook, you can experiment with the model parameters to achieve different model sizes. This notebook uses the [Hugging Face Transformers GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html) implementation with the SMP integration.\n", - "\n", - "Sharded Data Parallelism is a distributed training technique that splits the model parameters, gradients, and optimizer states across GPUs in a data parallel group. It is purpose-built for extreme-scale models and leverages Amazon in-house [MiCS](https://arxiv.org/pdf/2205.00119.pdf) technology which achieves near linear-scaling efficiency. For large models that cannot fit into a single GPU, we recommend to train with Sharded Data Parallelism technique with [Activation Checkpointing](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-extended-features-pytorch-activation-checkpointing.html) and [Activation Offloading](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-extended-features-pytorch-activation-offloading.html) in SMP first before leveraging other techniques such as Tensor or Pipeline Parallelism.\n", - "\n", - "\n", - "This notebook depends on the following files:\n", - "\n", - "- `train_gpt_simple.py`: The entrypoint script passed to the Hugging Face estimator in this notebook. This script is responsible for end to end training of the GPT-2 model with SMP. 
You can follow the comments to learn where the SMP API is used.\n", - "- `data_pipeline.py`: Datapipeline function to prepare the training data.\n", - "- `data_prep_512.py`: This downloads and preprocess the openwebtext dataset.\n", - "- `learining_rate.py`: Functions for learning rate schedule.\n", - "- `requirements.txt`: This installs the dependencies, including huggingface transformers.\n", - "- `memory_tracker.py`: Functions to track memory usage.\n", - "- `sharded_data_parallel_checkpoint.py`: Checkpoint utils for Sharded Data Parallelism\n", - "\n", - "### Additional Resources\n", - "- To learn more about the SageMaker model parallelism library, see [Model Parallel Distributed Training with SageMaker Distributed](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel.html).\n", - "\n", - "- To learn more about using the SageMaker Python SDK with PyTorch, see [Using PyTorch with the SageMaker Python SDK](https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html).\n", - "\n", - "- To learn more about launching a training job in Amazon SageMaker with your own training image, see [Use Your Own Training Algorithms](https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo.html).\n", - "\n", - "- To learn more about Sharded Data Parallelism, check out [the document](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-extended-features-pytorch-sharded-data-parallelism.html) or [this blog post](https://www.amazon.science/blog/near-linear-scaling-of-gigantic-model-training-on-aws).\n", - "\n", - "### Prerequisites\n", - "You must create an S3 bucket to store the input data for training. This bucket must be located in the same AWS Region that you choose to launch your training job. To learn more, see [Creating a bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html) in the *Amazon S3 documentation*.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Amazon SageMaker Initialization\n", - "\n", - "Run the following cell to import SageMaker modules and retrieve information of your current SageMaker work environment, such as your AWS account ID, the AWS Region, and the ARN of your Amazon SageMaker execution role. Upgrade SageMaker SDK to the latest version. \n", - "\n", - "**NOTE:** This step might require a kernel restart." 
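The Prerequisites above require an S3 bucket in the same Region as the training job. The notebook itself falls back to `sagemaker_session.default_bucket()`, but if you want a dedicated bucket, a minimal boto3 sketch (the bucket name is a placeholder, not from the notebook) could look like this:

```python
import boto3

region = boto3.session.Session().region_name
s3 = boto3.client("s3", region_name=region)

bucket_name = "my-smp-gpt2-training-data"  # placeholder; S3 bucket names must be globally unique

# us-east-1 rejects an explicit LocationConstraint; every other Region requires one.
if region == "us-east-1":
    s3.create_bucket(Bucket=bucket_name)
else:
    s3.create_bucket(
        Bucket=bucket_name,
        CreateBucketConfiguration={"LocationConstraint": region},
    )
```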
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "%pip install --upgrade sagemaker\n", - "%pip install sagemaker-experiments" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "%%time\n", - "import os\n", - "\n", - "import boto3\n", - "import sagemaker\n", - "from sagemaker import get_execution_role\n", - "from sagemaker.pytorch import PyTorch\n", - "\n", - "role = (\n", - " get_execution_role()\n", - ") # provide a pre-existing role ARN as an alternative to creating a new role\n", - "print(f\"SageMaker Execution Role: {role}\")\n", - "\n", - "client = boto3.client(\"sts\")\n", - "account = client.get_caller_identity()[\"Account\"]\n", - "print(f\"AWS account: {account}\")\n", - "\n", - "session = boto3.session.Session()\n", - "region = session.region_name\n", - "print(f\"AWS region: {region}\")\n", - "\n", - "sm_boto_client = boto3.client(\"sagemaker\")\n", - "sagemaker_session = sagemaker.session.Session(boto_session=session)\n", - "\n", - "# get default bucket\n", - "default_bucket = sagemaker_session.default_bucket()\n", - "print()\n", - "print(\"Default bucket for this session: \", default_bucket)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Prepare your dataset\n", - "We recommend that you use the [openwebtext](https://huggingface.co/datasets/viewer/?dataset=openwebtext) in this notebook. You can use the `data_prep_512.py` script to download and preprocess the dataset. The entire process takes 3 to 4 hours, so it is recommended to run the script in a separate SageMaker notebook instance and upload the processed data into your S3 bucket. The script requires `datasets` and `transformers` libraries. Run the following commands to install the libraries.\n", - "```\n", - "pip install datasets\n", - "pip install transformers\n", - "```\n", - "You can also use your own dataset. Modify the `data_pipeline.py` to serve your purposes.\n", - "\n", - "**NOTE:** In this notebook, we provide a wiki corpus dataset sample for the `amazon-sagemaker-examples` repository's continuous integration (CI) test. This sample data is small and not meant to train for convergence." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Specify Amazon S3 Bucket Paths" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "You need to specify S3 paths for training and test datasets for your training job. The S3 bucket must be in the same region as where the training job runs.\n", - "\n", - "Replace the `None` values at the top of the following cell with your S3 bucket and prefix of your preprocessed data. For example, if your training data is in `s3://DOC-EXAMPLE-BUCKET/training`, specify it to the `s3_train_bucket` variable in string format.\n", - "\n", - "If you proceed with `None` values for both `s3_train_bucket` and `s3_test_bucket`, then the notebook downloads the wiki corpus mock dataset from the public SageMaker S3 bucket (`s3://sagemaker-sample-files`) and upload it to your default bucket. This is intended for CI." 
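If you preprocess openwebtext with `data_prep_512.py` on a separate instance as suggested above, the output still has to reach S3 before it can be referenced as `s3_train_bucket` in the next cell. A minimal sketch, assuming the processed files landed in a hypothetical local directory `./openwebtext_512`:

```python
import sagemaker

sagemaker_session = sagemaker.session.Session()

# Upload the locally preprocessed dataset to the session's default bucket.
train_data_s3_uri = sagemaker_session.upload_data(
    path="./openwebtext_512",  # hypothetical output directory of data_prep_512.py
    bucket=sagemaker_session.default_bucket(),
    key_prefix="datasets/openwebtext_512",
)
print(train_data_s3_uri)  # use this URI as s3_train_bucket in the next cell
```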
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "s3_train_bucket = None # Specify your S3 bucket path for training dataset\n", - "s3_test_bucket = None # Specify your S3 bucket path for test dataset\n", - "\n", - "\n", - "# For CI, integration test of the repo pipeline\n", - "if s3_train_bucket == None:\n", - " # Download some mock data from a public bucket in us-east-1\n", - " s3 = boto3.resource(\"s3\")\n", - " bucket_name = \"sagemaker-sample-files\"\n", - " # Phase 1 pretraining\n", - " prefix = \"datasets/binary/bert/hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en_abstract\"\n", - "\n", - " local_dir = \"/tmp/data\"\n", - " bucket = s3.Bucket(bucket_name)\n", - "\n", - " for obj in bucket.objects.filter(Prefix=prefix):\n", - " target = os.path.join(local_dir, obj.key)\n", - " if not os.path.exists(os.path.dirname(target)):\n", - " os.makedirs(os.path.dirname(target))\n", - " bucket.download_file(obj.key, target)\n", - "\n", - " # upload to default bucket\n", - " mock_data = sagemaker_session.upload_data(\n", - " path=os.path.join(local_dir, prefix),\n", - " bucket=sagemaker_session.default_bucket(),\n", - " key_prefix=prefix,\n", - " )\n", - " running_ci = True\n", - "else:\n", - " running_ci = False" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "The following cell sets up the output path to store artifacts of the training job. You can modify this as needed." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "s3_output_location = f\"s3://{default_bucket}/output/\"\n", - "print(f\"your output data storage path: s3://{default_bucket}/output/\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Define Data Channels for SageMaker Training Using Amazon S3\n", - "\n", - "In this step, you define SageMaker training data channels using the above buckets. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "# Set use_fsx to False by default\n", - "# Set below var to True if you want to use fsx (see next cell)\n", - "use_fsx = False\n", - "if not use_fsx:\n", - " if s3_train_bucket != None:\n", - " train = sagemaker.inputs.TrainingInput(\n", - " s3_train_bucket, distribution=\"FullyReplicated\", s3_data_type=\"S3Prefix\"\n", - " )\n", - " data_channels = {\"train\": train}\n", - " else:\n", - " data_channels = {\"train\": mock_data}\n", - " if s3_test_bucket != None:\n", - " test = sagemaker.inputs.TrainingInput(\n", - " s3_test_bucket, distribution=\"FullyReplicated\", s3_data_type=\"S3Prefix\"\n", - " )\n", - " data_channels[\"test\"] = test\n", - " else:\n", - " data_channels[\"test\"] = mock_data\n", - " print(data_channels)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## (Optional) Set Up and Use Amazon FSx for Data Channels and Checkpoints\n", - "\n", - "While the previous option of using Amazon S3 is easier to setup, using an FSx can be beneficial for performance when dealing with large input sizes and large model sizes. If you are using models above 13B, checkpointing should be done using FSx. 
\n", - "\n", - "Please see the instructions from [Distributed Training of Mask-RCNN in Amazon SageMaker Using FSx](https://github.com/aws/amazon-sagemaker-examples/blob/master/advanced_functionality/distributed_tensorflow_mask_rcnn/mask-rcnn-scriptmode-fsx.ipynb) to create an FSx Lustre file system and import the dataset from the S3 bucket to your FSx file system. Note that the FSx file system must be created in a private subnet with internet gateway to ensure that training job has access to the internet. For general guidance on setting an FSx Lustre file system as data input channel, see [Configure Data Input Channel to Use Amazon FSx for Lustre](https://docs.aws.amazon.com/sagemaker/latest/dg/model-access-training-data.html#model-access-training-data-fsx)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "# Instructions obtained from:\n", - "# https://github.com/aws/amazon-sagemaker-examples/blob/master/advanced_functionality/distributed_tensorflow_mask_rcnn/mask-rcnn-scriptmode-fsx.ipynb\n", - "\n", - "if use_fsx:\n", - " from sagemaker.inputs import FileSystemInput\n", - "\n", - " # Specify FSx Lustre file system id.\n", - " file_system_id = \"\"\n", - "\n", - " # Specify the SG and subnet used by the FSX, these are passed to SM Estimator so jobs use this as well\n", - " fsx_security_group_id = \"\"\n", - " fsx_subnet = \"\"\n", - "\n", - " # Specify directory path for input data on the file system.\n", - " # You need to provide normalized and absolute path below.\n", - " # Your mount name can be provided by you when creating fsx, or generated automatically.\n", - " # You can find this mount_name on the FSX page in console.\n", - " # Example of fsx generated mount_name: \"3x5lhbmv\"\n", - " base_path = \"\"\n", - "\n", - " # Specify your file system type.\n", - " file_system_type = \"FSxLustre\"\n", - "\n", - " train = FileSystemInput(\n", - " file_system_id=file_system_id,\n", - " file_system_type=file_system_type,\n", - " directory_path=base_path,\n", - " file_system_access_mode=\"rw\",\n", - " )\n", - "\n", - " data_channels = {\"train\": train, \"test\": train}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Set Up Hyperparameters, Metric Definitions, and MPI Options\n", - "The following `hyperparameters` dictionary passes arguments to the training script (`train_gpt_simple.py`) and set the model parallel configuration when creating the training job.\n", - "\n", - "You can also add custom `mpi` flags. By default, we have `--mca btl_vader_single_copy_mechanism none` to remove unnecessary logs.\n", - "\n", - "Next, we add a base metric definitions to enable the metric upload in SageMaker. You can add any further metric definitions.\n", - "\n", - "Note that we added the `sharded_data_parallel_degree` parameter to the `hyperparameter` dictionary. This will be parsed and used when we configure a SageMaker PyTorch estimator to activate sharded data parallelism." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "hyperparameters = {\n", - " \"max_steps\": 100,\n", - " \"seed\": 12345,\n", - " \"fp16\": 0,\n", - " \"bf16\": 1,\n", - " \"lr\": 2.0e-4,\n", - " \"lr_decay_iters\": 125000,\n", - " \"min_lr\": 0.00001,\n", - " \"lr-decay-style\": \"linear\",\n", - " \"warmup\": 0.01,\n", - " \"num_kept_checkpoints\": 5,\n", - " \"checkpoint_freq\": 200,\n", - " \"logging_freq\": 1,\n", - " \"save_final_full_model\": 0,\n", - " \"delayed_param\": 1,\n", - " \"use_distributed_transformer\": 1,\n", - " \"offload_activations\": 1,\n", - " \"activation_loading_horizon\": 4,\n", - " \"gradient_accumulation\": 1,\n", - " \"validation_freq\": 200,\n", - " \"train_batch_size\": 4,\n", - " \"val_batch_size\": 4,\n", - " # parameters for sharded data parallelism\n", - " \"sharded_data_parallel_degree\": 2,\n", - "}\n", - "\n", - "if use_fsx:\n", - " # make sure to update paths for training-dir and test-dir based on the paths of datasets in fsx\n", - " # If you want to resume training, set checkpoint-dir to the same path as a previous job.\n", - " SM_TRAIN_DIR = \"/opt/ml/input/data/train\"\n", - " hyperparameters[\"checkpoint-dir\"] = f\"{SM_TRAIN_DIR}/checkpointdir-job2\"\n", - " hyperparameters[\"model-dir\"] = f\"{SM_TRAIN_DIR}/modeldir-job2\"\n", - " hyperparameters[\"training-dir\"] = f\"{SM_TRAIN_DIR}/datasets/pytorch_gpt2/train_synthetic\"\n", - " hyperparameters[\"test-dir\"] = f\"{SM_TRAIN_DIR}/datasets/pytorch_gpt2/val_synthetic\"\n", - "\n", - "# The checkpoint path (hyperparameters['checkpoint-dir'] or checkpoint_s3_uri) is not unique per job.\n", - "# You need to modify as needed for different runs.\n", - "# If same path is used for unrelated runs, this may increase time when downloading unnecessary checkpoints,\n", - "# and cause conflicts when loading checkpoints.\n", - "\n", - "mpioptions = \"-x NCCL_DEBUG=WARN -x SMDEBUG_LOG_LEVEL=ERROR \"\n", - "mpioptions += (\n", - " \"-x SMP_DISABLE_D2D=1 -x SMP_D2D_GPU_BUFFER_SIZE_BYTES=1 -x SMP_NCCL_THROTTLE_LIMIT=1 \"\n", - ")\n", - "mpioptions += \"-x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa -x RDMAV_FORK_SAFE=1\"\n", - "\n", - "metric_definitions = [\n", - " {\"Name\": \"base_metric\", \"Regex\": \"<><><><><><>\"}\n", - "] # Add your custom metric definitions" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Set the model configuration. Specify one from `gpt2-30b`, `gpt2-xl` and `gpt2-small`." 
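The configurations in the next cell differ mainly in `hidden_width` and `num_layers`. As a rough sanity check on the model sizes they imply (this estimate is not part of the notebook), the usual 12 * L * H^2 transformer approximation plus the embedding table gives:

```python
def approx_gpt2_params(hidden_width, num_layers, vocab_size=50257, max_context_width=2048):
    """Very rough GPT-2 parameter estimate: 12 * L * H^2 for the blocks plus embeddings."""
    block_params = 12 * num_layers * hidden_width**2
    embedding_params = (vocab_size + max_context_width) * hidden_width
    return block_params + embedding_params

print(f"gpt2-xl  (H=1536, L=48) ~ {approx_gpt2_params(1536, 48) / 1e9:.2f} B parameters")
print(f"gpt2-30b (H=7168, L=48) ~ {approx_gpt2_params(7168, 48) / 1e9:.1f} B parameters")
```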
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "model_config = \"gpt2-xl\"\n", - "\n", - "if model_config == \"gpt2-30b\":\n", - " model_params = {\n", - " \"max_context_width\": 2048,\n", - " \"hidden_width\": 7168,\n", - " \"num_layers\": 48,\n", - " \"num_heads\": 64,\n", - " }\n", - "\n", - "elif model_config == \"gpt2-xl\":\n", - " # 1.5B\n", - " model_params = {\n", - " \"max_context_width\": 2048,\n", - " \"hidden_width\": 1536,\n", - " \"num_layers\": 48,\n", - " \"num_heads\": 24,\n", - " }\n", - "elif model_config == \"gpt2-small\":\n", - " model_params = {\n", - " \"max_context_width\": 2048,\n", - " \"hidden_width\": 768,\n", - " \"num_layers\": 12,\n", - " \"num_heads\": 12,\n", - " }\n", - "else:\n", - " raise RuntimeError(\"Unknown model config\")\n", - "\n", - "for k, v in model_params.items():\n", - " hyperparameters[k] = v" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Specify Essential Parameters for a SageMaker Training Job\n", - "\n", - "Next, you use the [`SageMaker Estimator class`](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) to define a SageMaker Training Job, passing values through the following parameters for training job name, the number of EC2 instances, the instance type, and the size of the volume attached to the instances. \n", - "\n", - "* `instance_count`\n", - "* `instance_type`\n", - "* `volume_size`\n", - "* `base_job_name`\n", - "\n", - "### Update the Type and Number of EC2 Instance to Use\n", - "\n", - "The instance type and the number of instances you specify to the `instance_type` and `instance_count` parameters, respectively, determine the total number of GPUs (world size).\n", - "\n", - "$$ \\text{(world size) = (the number of GPUs on a single instance)}\\times\\text{(the number of instances)}$$" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "instance_type = \"ml.p4d.24xlarge\"\n", - "\n", - "# for gpt2 30b, you need at least 16 p4d instances\n", - "# gpt2 xl can be run using a single p4d at the minimum\n", - "# gpt2 small can be run using a single p3.16 at the minimum\n", - "# instance_count = 16\n", - "instance_count = 2\n", - "\n", - "# set to the number of GPUs on that instance\n", - "processes_per_host = 8" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "To look up the number of GPUs of different instance types, see [Amazon EC2 Instance Types](https://aws.amazon.com/ec2/instance-types/). Use the section **Accelerated Computing** to see general purpose GPU instances. Note that, for example, a given instance type `p4d.24xlarge` has a corresponding instance type `ml.p4d.24xlarge` in SageMaker.\n", - "For SageMaker supported `ml` instances and cost information, see [Amazon SageMaker Pricing](https://aws.amazon.com/sagemaker/pricing/). " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "### Attach an EBS Volume to the Training Instance\n", - "The volume size you specify in `volume_size` must be larger than your input data size. In this example, the volume size is set to 500GB." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "volume_size = 500" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note:** For NVMe-type memory attached instances, you don't need to specify `volume_size`. The `volume_size` parameter attaches EBS volumes to instance types that don't have instance storage. For more information, see [Tips and Considerations for Setting Up Storage Paths](https://docs.aws.amazon.com/sagemaker/latest/dg/model-train-storage.html#model-train-storage-tips-considerations)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "### Specify a Base Job Name" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "machine_str = instance_type.split(\".\")[1] + instance_type.split(\".\")[2][:3]\n", - "sharding_degree = hyperparameters[\"sharded_data_parallel_degree\"]\n", - "base_job_name = (\n", - " f'smp-{model_config}-{machine_str}-sdp{sharding_degree}-bs{hyperparameters[\"train_batch_size\"]}'\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "if not use_fsx:\n", - " # If you want to resume training, set checkpoint_s3_uri to the same path as a previous job.\n", - " # Previous checkpoint to load must have same model config.\n", - " checkpoint_bucket = f\"s3://sagemaker-{region}-{account}/\"\n", - " checkpoint_s3_uri = (\n", - " f\"{checkpoint_bucket}/experiments/gpt_synthetic_simpletrainer_checkpoints/{base_job_name}/\"\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "print(f\"base_job_name: {base_job_name} checkpoint_s3_uri: {checkpoint_s3_uri}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "### Create a SageMaker PyTorch Estimator\n", - "\n", - "The following cell constructs a PyTorch estimator using the parameters defined above. To see how the SageMaker APIs and functions are applied to the script, see the `train_gpt_simple.py` file." 
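One practical note before the estimator cell below: because `checkpoint_s3_uri` above is derived only from `base_job_name`, unrelated runs with the same configuration share the same S3 prefix, which the earlier comments warn against. A hedged workaround, not part of the notebook, is to append a per-run suffix; `checkpoint_bucket` and `base_job_name` come from the preceding cells.

```python
import time

# Make the checkpoint prefix unique per run to avoid pulling another run's
# checkpoints or clashing when resuming (see the warning in the comments above).
run_id = time.strftime("%Y%m%d-%H%M%S")
checkpoint_s3_uri = (
    f"{checkpoint_bucket}/experiments/gpt_synthetic_simpletrainer_checkpoints/"
    f"{base_job_name}-{run_id}/"
)
print(checkpoint_s3_uri)
```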
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "kwargs = {}\n", - "if use_fsx:\n", - " # Use the security group and subnet that was used to create the fsx filesystem\n", - " kwargs[\"security_group_ids\"] = [fsx_security_group_id]\n", - " kwargs[\"subnets\"] = [fsx_subnet]\n", - "\n", - "smp_estimator = PyTorch(\n", - " entry_point=\"train_gpt_simple.py\",\n", - " source_dir=os.getcwd(),\n", - " role=role,\n", - " instance_type=instance_type,\n", - " volume_size=volume_size,\n", - " instance_count=instance_count,\n", - " sagemaker_session=sagemaker_session,\n", - " distribution={\n", - " \"mpi\": {\n", - " \"enabled\": True,\n", - " \"processes_per_host\": processes_per_host,\n", - " \"custom_mpi_options\": mpioptions,\n", - " },\n", - " \"smdistributed\": {\n", - " \"modelparallel\": {\n", - " \"enabled\": True,\n", - " \"parameters\": {\n", - " \"ddp\": True,\n", - " \"skip_tracing\": True,\n", - " \"delayed_parameter_initialization\": hyperparameters[\"delayed_param\"] > 0,\n", - " \"offload_activations\": hyperparameters[\"offload_activations\"] > 0,\n", - " \"activation_loading_horizon\": hyperparameters[\"activation_loading_horizon\"],\n", - " \"sharded_data_parallel_degree\": hyperparameters[\"sharded_data_parallel_degree\"],\n", - " \"fp16\": hyperparameters[\"fp16\"] > 0,\n", - " \"bf16\": hyperparameters[\"bf16\"] > 0,\n", - " # partitions is a required param in the current SM SDK so it needs to be passed,\n", - " \"partitions\": 1,\n", - " },\n", - " }\n", - " },\n", - " },\n", - " framework_version=\"1.12\",\n", - " py_version=\"py38\",\n", - " output_path=s3_output_location,\n", - " checkpoint_s3_uri=checkpoint_s3_uri if not use_fsx else None,\n", - " checkpoint_local_path=hyperparameters[\"checkpoint-dir\"] if use_fsx else None,\n", - " metric_definitions=metric_definitions,\n", - " hyperparameters=hyperparameters,\n", - " debugger_hook_config=False,\n", - " disable_profiler=True,\n", - " base_job_name=base_job_name,\n", - " **kwargs,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Finally, run the `estimator.fit` method to launch the SageMaker training job of the GPT-2 model with sharded data parallelism." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "smp_estimator.fit(inputs=data_channels, logs=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Accessing the Training Logs\n", - "\n", - "You can access the training logs from [Amazon CloudWatch](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/WhatIsCloudWatch.html). Make sure to look at the logs of **algo-1** because that is the main node whose output stream has the training job logs.\n", - "\n", - "You can use CloudWatch to track SageMaker GPU and memory utilization during training and inference. To view the metrics and logs that SageMaker writes to CloudWatch, see [SageMaker Jobs and Endpoint Metrics](https://docs.aws.amazon.com/sagemaker/latest/dg/monitoring-cloudwatch.html#cloudwatch-metrics-jobs) in the Amazon SageMaker Developer Guide.\n", - "\n", - "If you are a new user of CloudWatch, see [Getting Started with Amazon CloudWatch](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/GettingStarted.html). 
\n", - "\n", - "For additional information on monitoring and analyzing Amazon SageMaker training jobs, see [Monitor and Analyze Training Jobs Using Metrics](https://docs.aws.amazon.com/sagemaker/latest/dg/training-metrics.html).\n", - "\n", - "## Deploying Trained Model for Inference\n", - "\n", - "In most cases, a trained model can be deployed on a single device for inference because inference only requires a small amount of memory. You can use the SMP API to create a single, unified model after training: the [smp.DistributedModel.save_model()](https://sagemaker.readthedocs.io/en/stable/api/training/smp_versions/latest/smd_model_parallel_tensorflow.html#smp.DistributedModel.save_model) method for TensorFlow, and the [smp.save()](https://sagemaker.readthedocs.io/en/stable/api/training/smp_versions/latest/smd_model_parallel_pytorch.html#apis-for-saving-and-loading) function for PyTorch.\n", - "\n", - "After you build and train your models, you can deploy them to get predictions in one of two ways:\n", - "\n", - "* To set up a persistent endpoint to get predictions from your models, use SageMaker hosting services. For an overview on deploying a single model or multiple models with SageMaker hosting services, see [Deploy a Model on SageMaker Hosting Services](https://docs.aws.amazon.com/sagemaker/latest/dg/how-it-works-deployment.html#how-it-works-hosting).\n", - "* To get predictions for an entire dataset, use SageMaker batch transform. For an overview on deploying a model with SageMaker Batch Transform, see [Get Inferences for an Entire Dataset with Batch Transform](https://docs.aws.amazon.com/sagemaker/latest/dg/how-it-works-batch.html).\n", - "\n", - "To learn more about deploying models for inference using SageMaker, see [Deploy Models for Inference](https://docs.aws.amazon.com/sagemaker/latest/dg/deploy-model.html). \n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Notebook CI Test Results\n", - "\n", - "This notebook was tested in multiple regions. The test results are as follows, except for us-west-2 which is shown at the top of the notebook.\n", - "\n", - "![This us-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/us-east-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", - "\n", - "![This us-east-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/us-east-2/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", - "\n", - "![This us-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/us-west-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", - "\n", - "![This ca-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ca-central-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", - "\n", - "![This sa-east-1 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/sa-east-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", - "\n", - "![This eu-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/eu-west-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", - "\n", - "![This eu-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/eu-west-2/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", - "\n", - "![This eu-west-3 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/eu-west-3/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", - "\n", - "![This eu-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/eu-central-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", - "\n", - "![This eu-north-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/eu-north-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", - "\n", - "![This ap-southeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ap-southeast-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", - "\n", - "![This ap-southeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ap-southeast-2/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", - "\n", - "![This ap-northeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ap-northeast-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", - "\n", - "![This ap-northeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ap-northeast-2/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n", - "\n", - "![This ap-south-1 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ap-south-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple-sharded-data-parallel.ipynb)\n" - ] - } - ], - "metadata": { - "hide_input": false, - "instance_type": "ml.t3.medium", - "kernelspec": { - "display_name": "conda_pytorch_p36", - "language": "python", - "name": "conda_pytorch_p36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-simple.ipynb b/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-simple.ipynb deleted file mode 100644 index fb241b47af..0000000000 --- a/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-simple.ipynb +++ /dev/null @@ -1,1290 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Train GPT-2 with PyTorch 1.12 and Tensor Parallelism Using the SageMaker Model Parallelism Library" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "This notebook's CI test result for us-west-2 is as follows. CI test results in other regions can be found at the end of the notebook. \n", - "\n", - "![This us-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/us-west-2/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple.ipynb)\n", - "\n", - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook walks you through how to use the SageMaker model parallelism (SMP) library. You'll learn how to train the GPT-2 model with SageMaker's model parallelism.\n", - "\n", - "The GPT-2 model was proposed by OpenAI in paper [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf). The original GPT-2 is a large transformer-based language model with 1.5 billion parameters. In this notebook, you can experiment with the model parameters to achieve different model sizes. This notebook uses the [Hugging Face Transformers GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html) implementation with the SMP integration. \n", - "\n", - "This notebook depends on the following files and folders:\n", - "\n", - "- `train_gpt_simple.py`: This is an entrypoint script that is passed to the Hugging Face estimator in the notebook instructions. This script is responsible for end to end training of the GPT-2 model with SMP. 
The script has additional comments at places where the SMP API is used.\n", - "- `data_pipeline.py`: This contains the datapipeline function to prepare the training data.\n", - "- `learining_rate.py`: This contains the functions for learning rate schedule.\n", - "- `requirements.txt`: This will install the dependencies, like the right version of huggingface transformers.\n", - "- `data_prep_512.py`: This will download and preprocess the openwebtext dataset.\n", - "- `memory_tracker.py`: This contains the functions to track memory usage.\n", - "- `sharded_data_parallel_checkpoint.py`: This contains checkpoint util functions for sharded data parallelism\n", - "\n", - "### Additional Resources\n", - "If you are a new user of Amazon SageMaker, you may find the following helpful to learn more about SMP and using SageMaker with PyTorch.\n", - "\n", - "- To learn more about the SageMaker model parallelism library, see [Model Parallel Distributed Training with SageMaker Distributed](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel.html).\n", - "\n", - "- To learn more about using the SageMaker Python SDK with PyTorch, see [Using PyTorch with the SageMaker Python SDK](https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html).\n", - "\n", - "- To learn more about launching a training job in Amazon SageMaker with your own training image, see [Use Your Own Training Algorithms](https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo.html).\n", - "\n", - "### Prerequisites\n", - "You must create an S3 bucket to store the input data for training. This bucket must be located in the same AWS Region that you choose to launch your training job. To learn more, see [Creating a bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html) in the *Amazon S3 documentation*.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Amazon SageMaker Initialization\n", - "\n", - "Run the following cell to import SageMaker modules and retrieve information of your current SageMaker work environment, such as your AWS account ID, the AWS Region, and the ARN of your Amazon SageMaker execution role." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Upgrade SageMaker SDK to the latest version. \n", - "\n", - "**NOTE:** This step might require a kernel restart." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install --upgrade sagemaker\n", - "%pip install sagemaker-experiments" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "import os\n", - "\n", - "import boto3\n", - "import sagemaker\n", - "from sagemaker import get_execution_role\n", - "from sagemaker.pytorch import PyTorch\n", - "\n", - "role = (\n", - " get_execution_role()\n", - ") # provide a pre-existing role ARN as an alternative to creating a new role\n", - "print(f\"SageMaker Execution Role: {role}\")\n", - "\n", - "client = boto3.client(\"sts\")\n", - "account = client.get_caller_identity()[\"Account\"]\n", - "print(f\"AWS account: {account}\")\n", - "\n", - "session = boto3.session.Session()\n", - "region = session.region_name\n", - "print(f\"AWS region: {region}\")\n", - "\n", - "sm_boto_client = boto3.client(\"sagemaker\")\n", - "sagemaker_session = sagemaker.session.Session(boto_session=session)\n", - "\n", - "# get default bucket\n", - "default_bucket = sagemaker_session.default_bucket()\n", - "print()\n", - "print(\"Default bucket for this session: \", default_bucket)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prepare your dataset\n", - "[openwebtext](https://huggingface.co/datasets/viewer/?dataset=openwebtext) is a dataset that we recommend for training. You can use the script `data_prep_512.py` to download and preprocess the dataset. The entire process takes 3 to 4 hours, so it is recommended to run the script in a separate SageMaker notebook instance and upload the processed data into your S3 bucket. The script will require `datasets` and `transformers` to run, you could use the following commands to install the libraries:\n", - "```\n", - "pip install datasets\n", - "pip install transformers\n", - "```\n", - "You can also use your own dataset. Modify the `data_pipeline.py` to serve your purposes.\n", - "\n", - "**NOTE:** In this notebook, we provide a wiki corpus dataset sample for the `amazon-sagemaker-examples` repository's continuous integration (CI) test. This sample data is small and not meant to train for convergence." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Specify Amazon S3 Bucket Paths" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You need to specify S3 paths for training and test datasets for your training job. The S3 bucket must be in the same region as where the training job will run.\n", - "\n", - "Replace the `None` values at the top of the following cell with your S3 bucket and prefix of your preprocessed data. For example, if your training data is in `s3://DOC-EXAMPLE-BUCKET/training`, specify it to `s3_train_bucket`.\n", - "\n", - "If you proceed with `None` values for both `s3_train_bucket` and `s3_test_bucket`, then the notebook will download the wiki corpus mock dataset from the public SageMaker S3 bucket (`s3://sagemaker-example-files-prod-{region}`) and upload it to your default bucket. This is intended for CI." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "s3_train_bucket = None # Specify your S3 bucket path for training dataset\n", - "s3_test_bucket = None # Specify your S3 bucket path for test dataset\n", - "\n", - "\n", - "# For CI, integration test of the repo pipeline\n", - "if s3_train_bucket == None:\n", - " # Download some mock data from a public bucket in us-east-1\n", - " s3 = boto3.resource(\"s3\")\n", - " bucket_name = f\"sagemaker-example-files-prod-{region}\"\n", - " # Phase 1 pretraining\n", - " prefix = \"datasets/binary/bert/hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en_abstract\"\n", - "\n", - " local_dir = \"/tmp/data\"\n", - " bucket = s3.Bucket(bucket_name)\n", - "\n", - " for obj in bucket.objects.filter(Prefix=prefix):\n", - " target = os.path.join(local_dir, obj.key)\n", - " if not os.path.exists(os.path.dirname(target)):\n", - " os.makedirs(os.path.dirname(target))\n", - " bucket.download_file(obj.key, target)\n", - "\n", - " # upload to default bucket\n", - " mock_data = sagemaker_session.upload_data(\n", - " path=os.path.join(local_dir, prefix),\n", - " bucket=sagemaker_session.default_bucket(),\n", - " key_prefix=prefix,\n", - " )\n", - " running_ci = True\n", - "else:\n", - " running_ci = False" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following cell sets up the output path to store artifacts of the training job. You can modify this as needed." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "s3_output_location = f\"s3://{default_bucket}/output/\"\n", - "print(f\"your output data will be stored in: s3://{default_bucket}/output/\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define Data Channels for SageMaker Training Using Amazon S3\n", - "\n", - "In this step, you define SageMaker training data channels using the above buckets. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Set use_fsx to False by default\n", - "# Set below var to True if you want to use fsx (see next cell)\n", - "use_fsx = False\n", - "if not use_fsx:\n", - " if s3_train_bucket != None:\n", - " train = sagemaker.inputs.TrainingInput(\n", - " s3_train_bucket, distribution=\"FullyReplicated\", s3_data_type=\"S3Prefix\"\n", - " )\n", - " data_channels = {\"train\": train}\n", - " else:\n", - " data_channels = {\"train\": mock_data}\n", - " if s3_test_bucket != None:\n", - " test = sagemaker.inputs.TrainingInput(\n", - " s3_test_bucket, distribution=\"FullyReplicated\", s3_data_type=\"S3Prefix\"\n", - " )\n", - " data_channels[\"test\"] = test\n", - " else:\n", - " data_channels[\"test\"] = mock_data\n", - " print(data_channels)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## (Optional) Set Up and Use Amazon FSx for Data Channels and Checkpoints\n", - "\n", - "While the previous option of using Amazon S3 is easier to setup, using an FSx can be beneficial for performance when dealing with large input sizes and large model sizes. If you are using models above 13B, checkpointing should be done using FSx. 
\n", - "\n", - "Please see the instructions from [Distributed Training of Mask-RCNN in Amazon SageMaker Using FSx](https://github.com/aws/amazon-sagemaker-examples/blob/master/advanced_functionality/distributed_tensorflow_mask_rcnn/mask-rcnn-scriptmode-fsx.ipynb) to create an FSx Lustre file system and import the dataset from the S3 bucket to your FSx file system. Note that the FSx file system must be created in a private subnet with internet gateway to ensure that training job has access to the internet. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Instructions obtained from:\n", - "# https://github.com/aws/amazon-sagemaker-examples/blob/master/advanced_functionality/distributed_tensorflow_mask_rcnn/mask-rcnn-scriptmode-fsx.ipynb\n", - "\n", - "if use_fsx:\n", - " from sagemaker.inputs import FileSystemInput\n", - "\n", - " # Specify FSx Lustre file system id.\n", - " file_system_id = \"\"\n", - "\n", - " # Specify the SG and subnet used by the FSX, these are passed to SM Estimator so jobs use this as well\n", - " fsx_security_group_id = \"\"\n", - " fsx_subnet = \"\"\n", - "\n", - " # Specify directory path for input data on the file system.\n", - " # You need to provide normalized and absolute path below.\n", - " # Your mount name can be provided by you when creating fsx, or generated automatically.\n", - " # You can find this mount_name on the FSX page in console.\n", - " # Example of fsx generated mount_name: \"3x5lhbmv\"\n", - " base_path = \"\"\n", - "\n", - " # Specify your file system type.\n", - " file_system_type = \"FSxLustre\"\n", - "\n", - " train = FileSystemInput(\n", - " file_system_id=file_system_id,\n", - " file_system_type=file_system_type,\n", - " directory_path=base_path,\n", - " file_system_access_mode=\"rw\",\n", - " )\n", - "\n", - " data_channels = {\"train\": train, \"test\": train}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set Up Hyperparameters, Metric Definitions, and MPI Options\n", - "The following `hyperparameters` dictionary is to pass arguments to the training script (`train_gpt_simple.py`) and set the model parallel configuration when creating the training job.\n", - "\n", - "You can also add custom mpi flags. By default, we have `--mca btl_vader_single_copy_mechanism none` to remove unnecessary logs.\n", - "\n", - "Next, we add a base metric definitions to enable the metric upload in SageMaker. You can add any further metric definitions." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "hyperparameters = {\n", - " \"max_steps\": 100,\n", - " \"seed\": 12345,\n", - " \"fp16\": 1,\n", - " \"lr\": 2.0e-4,\n", - " \"lr_decay_iters\": 125000,\n", - " \"min_lr\": 0.00001,\n", - " \"lr-decay-style\": \"linear\",\n", - " \"warmup\": 0.01,\n", - " \"num_kept_checkpoints\": 5,\n", - " \"checkpoint_freq\": 200,\n", - " \"logging_freq\": 1,\n", - " \"use_wiki_data\": 1,\n", - " # below flag loads model and optimizer state from checkpoint_s3_uri\n", - " # 'load_partial': 1,\n", - "}\n", - "\n", - "# Add parameters required by SMP config.\n", - "# Refer https://sagemaker.readthedocs.io/en/stable/api/training/smd_model_parallel_general.html\n", - "# for details.\n", - "hyperparameters.update(\n", - " {\n", - " \"save_final_full_model\": 0,\n", - " \"manual_partition\": 1,\n", - " \"skip_full_optimizer\": 1,\n", - " \"shard_optimizer_state\": 1,\n", - " \"activation_checkpointing\": 1,\n", - " \"activation_strategy\": \"each\",\n", - " \"optimize\": \"speed\",\n", - " }\n", - ")\n", - "\n", - "if not running_ci:\n", - " # those flags are used when training with the openwebtext dataset\n", - " hyperparameters[\"zipped_data\"] = 0\n", - " hyperparameters[\"validation_freq\"] = 20\n", - " hyperparameters[\"use_wiki_data\"] = 0\n", - "\n", - "if use_fsx:\n", - " # make sure to update paths for training-dir and test-dir based on the paths of datasets in fsx\n", - " # If you want to resume training, set checkpoint-dir to the same path as a previous job.\n", - " SM_TRAIN_DIR = \"/opt/ml/input/data/train\"\n", - " hyperparameters[\"checkpoint-dir\"] = f\"{SM_TRAIN_DIR}/checkpointdir-job2\"\n", - " hyperparameters[\"model-dir\"] = f\"{SM_TRAIN_DIR}/modeldir-job2\"\n", - " hyperparameters[\"training-dir\"] = f\"{SM_TRAIN_DIR}/datasets/pytorch_gpt2/train_synthetic\"\n", - " hyperparameters[\"test-dir\"] = f\"{SM_TRAIN_DIR}/datasets/pytorch_gpt2/val_synthetic\"\n", - "\n", - "# The checkpoint path (hyperparameters['checkpoint-dir'] or checkpoint_s3_uri) is not unique per job.\n", - "# You need to modify as needed for different runs.\n", - "# If same path is used for unrelated runs, this may increase time when downloading unnecessary checkpoints,\n", - "# and cause conflicts when loading checkpoints.\n", - "\n", - "\n", - "mpioptions = \"-x NCCL_DEBUG=WARN -x SMDEBUG_LOG_LEVEL=ERROR \"\n", - "mpioptions += (\n", - " \"-x SMP_DISABLE_D2D=1 -x SMP_D2D_GPU_BUFFER_SIZE_BYTES=1 -x SMP_NCCL_THROTTLE_LIMIT=1 \"\n", - ")\n", - "mpioptions += \"-x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa -x RDMAV_FORK_SAFE=1\"\n", - "\n", - "metric_definitions = [\n", - " {\"Name\": \"base_metric\", \"Regex\": \"<><><><><><>\"}\n", - "] # Add your custom metric definitions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set the model configuration. Specify one from `gpt2-30b`, `gpt2-xl` and `gpt2-small`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model_config = \"gpt2-small\"\n", - "\n", - "if model_config == \"gpt2-30b\":\n", - " model_params = {\n", - " \"max_context_width\": 512,\n", - " \"hidden_width\": 7168,\n", - " \"num_layers\": 48,\n", - " \"num_heads\": 64,\n", - " \"tensor_parallel_degree\": 8,\n", - " \"pipeline_parallel_degree\": 1,\n", - " \"train_batch_size\": 5,\n", - " \"val_batch_size\": 5,\n", - " \"prescaled_batch\": 0,\n", - " }\n", - "\n", - "elif model_config == \"gpt2-xl\":\n", - " # 1.5B\n", - " model_params = {\n", - " \"max_context_width\": 512,\n", - " \"hidden_width\": 1536,\n", - " \"num_layers\": 48,\n", - " \"num_heads\": 24,\n", - " \"tensor_parallel_degree\": 4,\n", - " \"pipeline_parallel_degree\": 1,\n", - " \"train_batch_size\": 2,\n", - " \"val_batch_size\": 4,\n", - " \"prescaled_batch\": 0,\n", - " }\n", - "elif model_config == \"gpt2-small\":\n", - " model_params = {\n", - " \"max_context_width\": 512,\n", - " \"hidden_width\": 768,\n", - " \"num_layers\": 12,\n", - " \"num_heads\": 12,\n", - " \"tensor_parallel_degree\": 4,\n", - " \"pipeline_parallel_degree\": 1,\n", - " \"train_batch_size\": 2,\n", - " \"val_batch_size\": 4,\n", - " \"prescaled_batch\": 0,\n", - " }\n", - "else:\n", - " raise RuntimeError(\"Unknown model config\")\n", - "\n", - "for k, v in model_params.items():\n", - " hyperparameters[k] = v" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Specify Essential Parameters for a SageMaker Training Job\n", - "\n", - "Next, you will use the [`SageMaker Estimator API`](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) to define a SageMaker Training Job, passing values through the following parameters for training job name, the number of EC2 instances, the instance type, and the size of the volume attached to the instances. \n", - "\n", - "* `instance_count`\n", - "* `instance_type`\n", - "* `volume_size`\n", - "* `base_job_name`\n", - "\n", - "### Update the Type and Number of EC2 Instance to Use\n", - "\n", - "The instance type and the number of instances you specify to the `instance_type` and `instance_count` parameters, respectively, will determine the total number of GPUs (world size).\n", - "\n", - "$$ \\text{(world size) = (the number of GPUs on a single instance)}\\times\\text{(the number of instances)}$$" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "instance_type = \"ml.p3.16xlarge\"\n", - "\n", - "# for gpt2 30b, you need at least 16 p4d instances\n", - "# gpt2 xl can be run using a single p4d at the minimum\n", - "# gpt2 small can be run using a single p3.16 at the minimum\n", - "instance_count = 1\n", - "\n", - "# set to the number of GPUs on that instance\n", - "processes_per_host = 8" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To look up the number of GPUs of different instance types, see [Amazon EC2 Instance Types](https://aws.amazon.com/ec2/instance-types/). Use the section **Accelerated Computing** to see general purpose GPU instances. Note that, for example, a given instance type `p4d.24xlarge` has a corresponding instance type `ml.p4d.24xlarge` in SageMaker.\n", - "For SageMaker supported `ml` instances and cost information, see [Amazon SageMaker Pricing](https://aws.amazon.com/sagemaker/pricing/). 
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Attach an EBS Volume to the Training Instance\n", - "The volume size you specify in `volume_size` must be larger than your input data size. In this example, the volume size is set to 500GB." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "volume_size = 500" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Specify a Base Job Name" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "machine_str = instance_type.split(\".\")[1] + instance_type.split(\".\")[2][:3]\n", - "pp_degree = hyperparameters[\"pipeline_parallel_degree\"]\n", - "tp_degree = hyperparameters[\"tensor_parallel_degree\"]\n", - "base_job_name = f'smp-{model_config}-{machine_str}-tp{tp_degree}-pp{pp_degree}-bs{hyperparameters[\"train_batch_size\"]}'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if not use_fsx:\n", - " # If you want to resume training, set checkpoint_s3_uri to the same path as a previous job.\n", - " # Previous checkpoint to load must have same model config.\n", - " checkpoint_bucket = f\"s3://sagemaker-{region}-{account}/\"\n", - " checkpoint_s3_uri = (\n", - " f\"{checkpoint_bucket}/experiments/gpt_synthetic_simpletrainer_checkpoints/{base_job_name}/\"\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(f\"base_job_name: {base_job_name} checkpoint_s3_uri: {checkpoint_s3_uri}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a SageMaker PyTorch Estimator\n", - "\n", - "The following cell constructs a PyTorch estimator using the parameters defined above. To see how the SageMaker tensor parallelism modules and functions are applied to the script, see the `train_gpt_simple.py` file." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "kwargs = {}\n", - "if use_fsx:\n", - " # Use the security group and subnet that was used to create the fsx filesystem\n", - " kwargs[\"security_group_ids\"] = [fsx_security_group_id]\n", - " kwargs[\"subnets\"] = [fsx_subnet]\n", - "\n", - "smp_estimator = PyTorch(\n", - " entry_point=\"train_gpt_simple.py\",\n", - " source_dir=os.getcwd(),\n", - " role=role,\n", - " instance_type=instance_type,\n", - " volume_size=volume_size,\n", - " instance_count=instance_count,\n", - " sagemaker_session=sagemaker_session,\n", - " distribution={\n", - " \"mpi\": {\n", - " \"enabled\": True,\n", - " \"processes_per_host\": processes_per_host,\n", - " \"custom_mpi_options\": mpioptions,\n", - " },\n", - " \"smdistributed\": {\n", - " \"modelparallel\": {\n", - " \"enabled\": True,\n", - " \"parameters\": {\n", - " \"ddp\": True,\n", - " \"tensor_parallel_degree\": hyperparameters[\"tensor_parallel_degree\"],\n", - " # partitions is a required param in the current SM SDK so it needs to be passed,\n", - " # these two map to the same config\n", - " \"partitions\": hyperparameters[\"pipeline_parallel_degree\"],\n", - " \"shard_optimizer_state\": hyperparameters[\"shard_optimizer_state\"] > 0,\n", - " \"prescaled_batch\": hyperparameters[\"prescaled_batch\"] > 0,\n", - " \"fp16\": hyperparameters[\"fp16\"] > 0,\n", - " \"optimize\": hyperparameters[\"optimize\"],\n", - " \"auto_partition\": False if hyperparameters[\"manual_partition\"] else True,\n", - " \"default_partition\": 0,\n", - " \"optimize\": hyperparameters[\"optimize\"],\n", - " },\n", - " }\n", - " },\n", - " },\n", - " framework_version=\"1.12\",\n", - " py_version=\"py38\",\n", - " output_path=s3_output_location,\n", - " checkpoint_s3_uri=checkpoint_s3_uri if not use_fsx else None,\n", - " checkpoint_local_path=hyperparameters[\"checkpoint-dir\"] if use_fsx else None,\n", - " metric_definitions=metric_definitions,\n", - " hyperparameters=hyperparameters,\n", - " debugger_hook_config=False,\n", - " disable_profiler=True,\n", - " base_job_name=base_job_name,\n", - " **kwargs,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, run the estimator to launch the SageMaker training job of GPT2 model with tensor parallelism." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "smp_estimator.fit(inputs=data_channels, logs=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Accessing the Training Logs\n", - "\n", - "You can access the training logs from [Amazon CloudWatch](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/WhatIsCloudWatch.html). Make sure to look at the logs of **algo-1** because that is the main node whose output stream will have the training job logs.\n", - "\n", - "You can use CloudWatch to track SageMaker GPU and memory utilization during training and inference. To view the metrics and logs that SageMaker writes to CloudWatch, see [SageMaker Jobs and Endpoint Metrics](https://docs.aws.amazon.com/sagemaker/latest/dg/monitoring-cloudwatch.html#cloudwatch-metrics-jobs) in the Amazon SageMaker Developer Guide.\n", - "\n", - "If you are a new user of CloudWatch, see [Getting Started with Amazon CloudWatch](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/GettingStarted.html). 
\n", - "\n", - "For additional information on monitoring and analyzing Amazon SageMaker training jobs, see [Monitor and Analyze Training Jobs Using Metrics](https://docs.aws.amazon.com/sagemaker/latest/dg/training-metrics.html).\n", - "\n", - "## Deploying Trained Model for Inference\n", - "\n", - "In most cases, a trained model can be deployed on a single device for inference because inference only requires a small amount of memory. You can use the SMP API to create a single, unified model after training: the [smp.DistributedModel.save_model()](https://sagemaker.readthedocs.io/en/stable/api/training/smp_versions/latest/smd_model_parallel_tensorflow.html#smp.DistributedModel.save_model) method for TensorFlow, and the [smp.save()](https://sagemaker.readthedocs.io/en/stable/api/training/smp_versions/latest/smd_model_parallel_pytorch.html#apis-for-saving-and-loading) function for PyTorch.\n", - "\n", - "After you build and train your models, you can deploy them to get predictions in one of two ways:\n", - "\n", - "* To set up a persistent endpoint to get predictions from your models, use SageMaker hosting services. For an overview on deploying a single model or multiple models with SageMaker hosting services, see [Deploy a Model on SageMaker Hosting Services](https://docs.aws.amazon.com/sagemaker/latest/dg/how-it-works-deployment.html#how-it-works-hosting).\n", - "* To get predictions for an entire dataset, use SageMaker batch transform. For an overview on deploying a model with SageMaker Batch Transform, see [Get Inferences for an Entire Dataset with Batch Transform](https://docs.aws.amazon.com/sagemaker/latest/dg/how-it-works-batch.html).\n", - "\n", - "To learn more about deploying models for inference using SageMaker, see [Deploy Models for Inference](https://docs.aws.amazon.com/sagemaker/latest/dg/deploy-model.html). \n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Notebook CI Test Results\n", - "\n", - "This notebook was tested in multiple regions. The test results are as follows, except for us-west-2 which is shown at the top of the notebook.\n", - "\n", - "![This us-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/us-east-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple.ipynb)\n", - "\n", - "![This us-east-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/us-east-2/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple.ipynb)\n", - "\n", - "![This us-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/us-west-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple.ipynb)\n", - "\n", - "![This ca-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ca-central-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple.ipynb)\n", - "\n", - "![This sa-east-1 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/sa-east-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple.ipynb)\n", - "\n", - "![This eu-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/eu-west-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple.ipynb)\n", - "\n", - "![This eu-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/eu-west-2/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple.ipynb)\n", - "\n", - "![This eu-west-3 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/eu-west-3/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple.ipynb)\n", - "\n", - "![This eu-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/eu-central-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple.ipynb)\n", - "\n", - "![This eu-north-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/eu-north-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple.ipynb)\n", - "\n", - "![This ap-southeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ap-southeast-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple.ipynb)\n", - "\n", - "![This ap-southeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ap-southeast-2/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple.ipynb)\n", - "\n", - "![This ap-northeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ap-northeast-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple.ipynb)\n", - "\n", - "![This ap-northeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ap-northeast-2/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple.ipynb)\n", - "\n", - "![This ap-south-1 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://h75twx4l60.execute-api.us-west-2.amazonaws.com/sagemaker-nb/ap-south-1/training|distributed_training|pytorch|model_parallel|gpt2|smp-train-gpt-simple.ipynb)\n" - ] - } - ], - "metadata": { - "availableInstances": [ - { - "_defaultOrder": 0, - "_isFastLaunch": true, - "category": "General purpose", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 4, - "name": "ml.t3.medium", - "vcpuNum": 2 - }, - { - "_defaultOrder": 1, - "_isFastLaunch": false, - "category": "General purpose", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 8, - "name": "ml.t3.large", - "vcpuNum": 2 - }, - { - "_defaultOrder": 2, - "_isFastLaunch": false, - "category": "General purpose", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 16, - "name": "ml.t3.xlarge", - "vcpuNum": 4 - }, - { - "_defaultOrder": 3, - "_isFastLaunch": false, - "category": "General purpose", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 32, - "name": "ml.t3.2xlarge", - "vcpuNum": 8 - }, - { - "_defaultOrder": 4, - "_isFastLaunch": true, - "category": "General purpose", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 8, - "name": "ml.m5.large", - "vcpuNum": 2 - }, - { - "_defaultOrder": 5, - "_isFastLaunch": false, - "category": "General purpose", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 16, - "name": "ml.m5.xlarge", - "vcpuNum": 4 - }, - { - "_defaultOrder": 6, - "_isFastLaunch": false, - "category": "General purpose", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 32, - "name": "ml.m5.2xlarge", - "vcpuNum": 8 - }, - { - "_defaultOrder": 7, - "_isFastLaunch": false, - "category": "General purpose", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 64, - "name": "ml.m5.4xlarge", - "vcpuNum": 16 - }, - { - "_defaultOrder": 8, - "_isFastLaunch": false, - "category": "General purpose", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 128, - "name": "ml.m5.8xlarge", - "vcpuNum": 32 - }, - { - "_defaultOrder": 9, - "_isFastLaunch": false, - "category": "General purpose", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 192, - "name": "ml.m5.12xlarge", - "vcpuNum": 48 - }, - { - "_defaultOrder": 10, - "_isFastLaunch": false, - "category": "General purpose", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 256, - "name": "ml.m5.16xlarge", - "vcpuNum": 64 - }, - { - "_defaultOrder": 11, - "_isFastLaunch": false, - "category": "General purpose", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 384, - "name": "ml.m5.24xlarge", - "vcpuNum": 96 - }, - { - "_defaultOrder": 12, - "_isFastLaunch": false, - "category": "General purpose", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 8, - "name": "ml.m5d.large", - "vcpuNum": 2 - }, - { - "_defaultOrder": 13, - "_isFastLaunch": false, - "category": "General purpose", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 16, - "name": "ml.m5d.xlarge", - "vcpuNum": 4 - }, - { - "_defaultOrder": 14, - "_isFastLaunch": false, - "category": "General purpose", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 32, - "name": "ml.m5d.2xlarge", - "vcpuNum": 8 - }, - { - "_defaultOrder": 15, - "_isFastLaunch": false, - "category": "General purpose", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 64, - "name": "ml.m5d.4xlarge", - "vcpuNum": 16 - }, - { - "_defaultOrder": 16, - "_isFastLaunch": false, - "category": "General purpose", - "gpuNum": 
0, - "hideHardwareSpecs": false, - "memoryGiB": 128, - "name": "ml.m5d.8xlarge", - "vcpuNum": 32 - }, - { - "_defaultOrder": 17, - "_isFastLaunch": false, - "category": "General purpose", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 192, - "name": "ml.m5d.12xlarge", - "vcpuNum": 48 - }, - { - "_defaultOrder": 18, - "_isFastLaunch": false, - "category": "General purpose", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 256, - "name": "ml.m5d.16xlarge", - "vcpuNum": 64 - }, - { - "_defaultOrder": 19, - "_isFastLaunch": false, - "category": "General purpose", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 384, - "name": "ml.m5d.24xlarge", - "vcpuNum": 96 - }, - { - "_defaultOrder": 20, - "_isFastLaunch": false, - "category": "General purpose", - "gpuNum": 0, - "hideHardwareSpecs": true, - "memoryGiB": 0, - "name": "ml.geospatial.interactive", - "supportedImageNames": [ - "sagemaker-geospatial-v1-0" - ], - "vcpuNum": 0 - }, - { - "_defaultOrder": 21, - "_isFastLaunch": true, - "category": "Compute optimized", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 4, - "name": "ml.c5.large", - "vcpuNum": 2 - }, - { - "_defaultOrder": 22, - "_isFastLaunch": false, - "category": "Compute optimized", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 8, - "name": "ml.c5.xlarge", - "vcpuNum": 4 - }, - { - "_defaultOrder": 23, - "_isFastLaunch": false, - "category": "Compute optimized", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 16, - "name": "ml.c5.2xlarge", - "vcpuNum": 8 - }, - { - "_defaultOrder": 24, - "_isFastLaunch": false, - "category": "Compute optimized", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 32, - "name": "ml.c5.4xlarge", - "vcpuNum": 16 - }, - { - "_defaultOrder": 25, - "_isFastLaunch": false, - "category": "Compute optimized", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 72, - "name": "ml.c5.9xlarge", - "vcpuNum": 36 - }, - { - "_defaultOrder": 26, - "_isFastLaunch": false, - "category": "Compute optimized", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 96, - "name": "ml.c5.12xlarge", - "vcpuNum": 48 - }, - { - "_defaultOrder": 27, - "_isFastLaunch": false, - "category": "Compute optimized", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 144, - "name": "ml.c5.18xlarge", - "vcpuNum": 72 - }, - { - "_defaultOrder": 28, - "_isFastLaunch": false, - "category": "Compute optimized", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 192, - "name": "ml.c5.24xlarge", - "vcpuNum": 96 - }, - { - "_defaultOrder": 29, - "_isFastLaunch": true, - "category": "Accelerated computing", - "gpuNum": 1, - "hideHardwareSpecs": false, - "memoryGiB": 16, - "name": "ml.g4dn.xlarge", - "vcpuNum": 4 - }, - { - "_defaultOrder": 30, - "_isFastLaunch": false, - "category": "Accelerated computing", - "gpuNum": 1, - "hideHardwareSpecs": false, - "memoryGiB": 32, - "name": "ml.g4dn.2xlarge", - "vcpuNum": 8 - }, - { - "_defaultOrder": 31, - "_isFastLaunch": false, - "category": "Accelerated computing", - "gpuNum": 1, - "hideHardwareSpecs": false, - "memoryGiB": 64, - "name": "ml.g4dn.4xlarge", - "vcpuNum": 16 - }, - { - "_defaultOrder": 32, - "_isFastLaunch": false, - "category": "Accelerated computing", - "gpuNum": 1, - "hideHardwareSpecs": false, - "memoryGiB": 128, - "name": "ml.g4dn.8xlarge", - "vcpuNum": 32 - }, - { - "_defaultOrder": 33, - "_isFastLaunch": false, - "category": "Accelerated computing", - "gpuNum": 4, - "hideHardwareSpecs": false, - "memoryGiB": 192, - 
"name": "ml.g4dn.12xlarge", - "vcpuNum": 48 - }, - { - "_defaultOrder": 34, - "_isFastLaunch": false, - "category": "Accelerated computing", - "gpuNum": 1, - "hideHardwareSpecs": false, - "memoryGiB": 256, - "name": "ml.g4dn.16xlarge", - "vcpuNum": 64 - }, - { - "_defaultOrder": 35, - "_isFastLaunch": false, - "category": "Accelerated computing", - "gpuNum": 1, - "hideHardwareSpecs": false, - "memoryGiB": 61, - "name": "ml.p3.2xlarge", - "vcpuNum": 8 - }, - { - "_defaultOrder": 36, - "_isFastLaunch": false, - "category": "Accelerated computing", - "gpuNum": 4, - "hideHardwareSpecs": false, - "memoryGiB": 244, - "name": "ml.p3.8xlarge", - "vcpuNum": 32 - }, - { - "_defaultOrder": 37, - "_isFastLaunch": false, - "category": "Accelerated computing", - "gpuNum": 8, - "hideHardwareSpecs": false, - "memoryGiB": 488, - "name": "ml.p3.16xlarge", - "vcpuNum": 64 - }, - { - "_defaultOrder": 38, - "_isFastLaunch": false, - "category": "Accelerated computing", - "gpuNum": 8, - "hideHardwareSpecs": false, - "memoryGiB": 768, - "name": "ml.p3dn.24xlarge", - "vcpuNum": 96 - }, - { - "_defaultOrder": 39, - "_isFastLaunch": false, - "category": "Memory Optimized", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 16, - "name": "ml.r5.large", - "vcpuNum": 2 - }, - { - "_defaultOrder": 40, - "_isFastLaunch": false, - "category": "Memory Optimized", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 32, - "name": "ml.r5.xlarge", - "vcpuNum": 4 - }, - { - "_defaultOrder": 41, - "_isFastLaunch": false, - "category": "Memory Optimized", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 64, - "name": "ml.r5.2xlarge", - "vcpuNum": 8 - }, - { - "_defaultOrder": 42, - "_isFastLaunch": false, - "category": "Memory Optimized", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 128, - "name": "ml.r5.4xlarge", - "vcpuNum": 16 - }, - { - "_defaultOrder": 43, - "_isFastLaunch": false, - "category": "Memory Optimized", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 256, - "name": "ml.r5.8xlarge", - "vcpuNum": 32 - }, - { - "_defaultOrder": 44, - "_isFastLaunch": false, - "category": "Memory Optimized", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 384, - "name": "ml.r5.12xlarge", - "vcpuNum": 48 - }, - { - "_defaultOrder": 45, - "_isFastLaunch": false, - "category": "Memory Optimized", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 512, - "name": "ml.r5.16xlarge", - "vcpuNum": 64 - }, - { - "_defaultOrder": 46, - "_isFastLaunch": false, - "category": "Memory Optimized", - "gpuNum": 0, - "hideHardwareSpecs": false, - "memoryGiB": 768, - "name": "ml.r5.24xlarge", - "vcpuNum": 96 - }, - { - "_defaultOrder": 47, - "_isFastLaunch": false, - "category": "Accelerated computing", - "gpuNum": 1, - "hideHardwareSpecs": false, - "memoryGiB": 16, - "name": "ml.g5.xlarge", - "vcpuNum": 4 - }, - { - "_defaultOrder": 48, - "_isFastLaunch": false, - "category": "Accelerated computing", - "gpuNum": 1, - "hideHardwareSpecs": false, - "memoryGiB": 32, - "name": "ml.g5.2xlarge", - "vcpuNum": 8 - }, - { - "_defaultOrder": 49, - "_isFastLaunch": false, - "category": "Accelerated computing", - "gpuNum": 1, - "hideHardwareSpecs": false, - "memoryGiB": 64, - "name": "ml.g5.4xlarge", - "vcpuNum": 16 - }, - { - "_defaultOrder": 50, - "_isFastLaunch": false, - "category": "Accelerated computing", - "gpuNum": 1, - "hideHardwareSpecs": false, - "memoryGiB": 128, - "name": "ml.g5.8xlarge", - "vcpuNum": 32 - }, - { - "_defaultOrder": 51, - "_isFastLaunch": false, - "category": 
"Accelerated computing", - "gpuNum": 1, - "hideHardwareSpecs": false, - "memoryGiB": 256, - "name": "ml.g5.16xlarge", - "vcpuNum": 64 - }, - { - "_defaultOrder": 52, - "_isFastLaunch": false, - "category": "Accelerated computing", - "gpuNum": 4, - "hideHardwareSpecs": false, - "memoryGiB": 192, - "name": "ml.g5.12xlarge", - "vcpuNum": 48 - }, - { - "_defaultOrder": 53, - "_isFastLaunch": false, - "category": "Accelerated computing", - "gpuNum": 4, - "hideHardwareSpecs": false, - "memoryGiB": 384, - "name": "ml.g5.24xlarge", - "vcpuNum": 96 - }, - { - "_defaultOrder": 54, - "_isFastLaunch": false, - "category": "Accelerated computing", - "gpuNum": 8, - "hideHardwareSpecs": false, - "memoryGiB": 768, - "name": "ml.g5.48xlarge", - "vcpuNum": 192 - }, - { - "_defaultOrder": 55, - "_isFastLaunch": false, - "category": "Accelerated computing", - "gpuNum": 8, - "hideHardwareSpecs": false, - "memoryGiB": 1152, - "name": "ml.p4d.24xlarge", - "vcpuNum": 96 - }, - { - "_defaultOrder": 56, - "_isFastLaunch": false, - "category": "Accelerated computing", - "gpuNum": 8, - "hideHardwareSpecs": false, - "memoryGiB": 1152, - "name": "ml.p4de.24xlarge", - "vcpuNum": 96 - } - ], - "hide_input": false, - "kernelspec": { - "display_name": "Python 3 (PyTorch 1.13 Python 3.9 CPU Optimized)", - "language": "python", - "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/pytorch-1.13-cpu-py39" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/training/distributed_training/pytorch/model_parallel/gpt2/train_gpt_simple.py b/training/distributed_training/pytorch/model_parallel/gpt2/train.py similarity index 60% rename from training/distributed_training/pytorch/model_parallel/gpt2/train_gpt_simple.py rename to training/distributed_training/pytorch/model_parallel/gpt2/train.py index ac2abb3a00..e1d168f2e4 100644 --- a/training/distributed_training/pytorch/model_parallel/gpt2/train_gpt_simple.py +++ b/training/distributed_training/pytorch/model_parallel/gpt2/train.py @@ -1,48 +1,41 @@ import argparse -import collections import logging import math import os -import re +import sys import time from concurrent.futures import ProcessPoolExecutor +from typing import Optional +import model_config as model_config_lib import numpy as np import smdistributed.modelparallel import smdistributed.modelparallel.torch as smp import torch -import torch.nn as nn import torch.utils.data import transformers -from data_pipeline import create_pretraining_dataloader -from learning_rates import AnnealingLR -from memory_tracker import memory_status, memory_status_cpu -from sharded_data_parallel_checkpoint import get_buffer_names, get_param_shapes -from smdistributed.modelparallel.torch.nn import FusedLayerNorm as LayerNorm -from smdistributed.modelparallel.torch.nn.huggingface.gpt2 import ( - translate_hf_state_dict_to_smdistributed_gpt2, - translate_state_dict_to_hf_gpt2, -) +from data_pipeline import create_pretraining_dataloader # pylint: disable=wrong-import-order +from learning_rates import AnnealingLR # pylint: disable=wrong-import-order +from memory_tracker import memory_status, memory_status_cpu # pylint: disable=wrong-import-order +from sdp_utils import build_param_id_to_buffer, build_param_id_to_offset, log_param_norms +from 
smdistributed.modelparallel.torch.nn import FusedLayerNorm # pylint: disable=import-error from torch import optim -from torch.nn.parallel.distributed import DistributedDataParallel -from transformers import ( - CONFIG_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, - AutoConfig, - AutoModelForCausalLM, - AutoTokenizer, - GPT2Config, - default_data_collator, - set_seed, -) +from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, set_seed from transformers.trainer_utils import is_main_process +# pylint: enable=import-error + + logging.getLogger("torch.distributed.distributed_c10d").setLevel(logging.ERROR) -logger = logging.getLogger(__name__) +if not sys.warnoptions: + import warnings + + warnings.simplefilter("ignore") -def get_learning_rate_scheduler(optimizer, args): +def get_learning_rate_scheduler(optimizer, args): + """Get learning rate scheduler.""" # Add linear learning rate scheduler. if args.lr_decay_iters is not None: num_iters = args.lr_decay_iters @@ -69,38 +62,44 @@ def get_learning_rate_scheduler(optimizer, args): def get_param_groups_by_weight_decay(module): + """Get param groups.""" weight_decay_params = {"params": []} no_weight_decay_params = {"params": [], "weight_decay": 0.0} param_ids = set() for module_ in module.modules(): - if isinstance(module_, LayerNorm): - for p in list(module_._parameters.values()): + if isinstance(module_, FusedLayerNorm): + for p in list(module_._parameters.values()): # pylint: disable=invalid-name if p is not None and id(p) not in param_ids: no_weight_decay_params["params"].append(p) param_ids.add(id(p)) else: - for n, p in list(module_._parameters.items()): + for n, p in list( # pylint: disable=invalid-name + module_._parameters.items() # pylint: disable=protected-access + ): if p is not None and n != "bias" and id(p) not in param_ids: weight_decay_params["params"].append(p) param_ids.add(id(p)) - for n, p in list(module_._parameters.items()): + for n, p in list( # pylint: disable=invalid-name + module_._parameters.items() # pylint: disable=protected-access + ): if p is not None and n == "bias" and id(p) not in param_ids: no_weight_decay_params["params"].append(p) param_ids.add(id(p)) + if not no_weight_decay_params["params"]: + return [weight_decay_params] return weight_decay_params, no_weight_decay_params # smdistributed: Define smp.step. Return any tensors needed outside. @smp.step -def train_step(model, optimizer, input_ids, attention_mask, args): +def train_step(model, input_ids, attention_mask, args): + """Train step.""" if args.logits_output: output = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids) loss = output["loss"] else: loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)["loss"] - model.backward(loss) - if args.logits_output: return output @@ -110,18 +109,20 @@ def train_step(model, optimizer, input_ids, attention_mask, args): # smdistributed: Define smp.step. Return any tensors needed outside. 
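As a usage note, the weight-decay grouping built by `get_param_groups_by_weight_decay` above feeds a standard PyTorch optimizer later in this script. The same idea in plain PyTorch, with a toy module standing in for the GPT-2 model (this is a sketch of the pattern, not the script's own function):

```python
import torch
from torch import nn

# Toy stand-in: LayerNorm parameters and biases get no weight decay, the rest do.
model = nn.Sequential(nn.Linear(16, 16), nn.LayerNorm(16), nn.Linear(16, 4))

decay, no_decay, seen = [], [], set()
for module in model.modules():
    for name, param in module.named_parameters(recurse=False):
        if id(param) in seen:
            continue
        seen.add(id(param))
        if isinstance(module, nn.LayerNorm) or name == "bias":
            no_decay.append(param)
        else:
            decay.append(param)

param_groups = [
    {"params": decay, "weight_decay": 0.01},   # 0.01 mirrors the notebook default
    {"params": no_decay, "weight_decay": 0.0},
]
optimizer = torch.optim.AdamW(param_groups, lr=2.0e-4)
print(len(decay), len(no_decay))  # 2 decayed tensors, 4 without decay
```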
@smp.step def test_step(model, input_ids, attention_mask): + """Test step.""" loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)["loss"] return loss -def eval_model(model, dataloader, num_batches, use_wiki_data): +def eval_model(model, dataloader, num_batches, use_bert_data): + """Eval model.""" model = model.eval() n_batches = 0 loss = 0.0 with torch.no_grad(): for batch_idx, input_data in enumerate(dataloader): - if use_wiki_data: + if use_bert_data: input_ids, _, attention_mask, _, _ = input_data else: input_ids, attention_mask = input_data @@ -144,7 +145,7 @@ def eval_model(model, dataloader, num_batches, use_wiki_data): return loss, ppl -def train( +def train( # pylint: disable=too-many-arguments,too-many-branches,too-many-locals,too-many-statements model, optimizer, lr_scheduler, @@ -154,18 +155,21 @@ def train( num_params, total_steps, args, + param_id_to_buffer, ): + """Train model.""" if args.enable_memory_profiling > 0: memory_status_cpu(msg="before train step") + model.train() if args.parallel_proc_data_processing: pool = ProcessPoolExecutor(1) dp_rank = smp.dp_rank() if not args.prescaled_batch else smp.rdp_rank() dp_size = smp.dp_size() if not args.prescaled_batch else smp.rdp_size() - data_type = "wiki" if args.use_wiki_data else "openwebtext" + data_type = "BERT" if args.use_bert_data else "GPT" - if args.use_wiki_data: + if args.use_bert_data: train_paths = sorted( [ os.path.join(args.training_dir, p) @@ -202,8 +206,8 @@ def train( if args.validation_freq is not None: # load all validation examples if smp.rank() == 0: - print("Creating val dataloader") - if args.use_wiki_data: + logging.info("Creating val dataloader") + if args.use_bert_data: val_paths = sorted( [ os.path.join(args.test_dir, p) @@ -237,10 +241,10 @@ def train( data_type=data_type, ) if smp.rank() == 0: - print("Created val dataloader") + logging.info("Created val dataloader of size %d.", len(val_dataloader)) start = time.time() - throughput = None + throughputs = [] to_save = {"loss": [], "val_loss": []} loss_metric = 0 @@ -252,8 +256,8 @@ def should_record(): if smp.tp_size() > 1: tp_group = smp.get_tp_group() return 0 in tp_group - else: - return smp.rank() == 0 + + return smp.rank() == 0 # Set the same seed for computation set_seed(args.seed) @@ -281,26 +285,32 @@ def should_record(): ) if smp.rank() == 0: - if args.use_wiki_data: - print(f"Reading data from training path {train_dataloader.dataset.input_file}") + if args.use_bert_data: + logging.info( + "Reading data from training path %s.", train_dataloader.dataset.input_file + ) else: - print(f"Reading data from training path {train_dataloader.dataset.input_paths}") + logging.info( + "Reading data from training path %s.", train_dataloader.dataset.input_paths + ) for batch_idx, input_data in enumerate(train_dataloader): if batch_idx < start_batch_index: if smp.rank() == 0: - print( - f"Resuming from saved batch index {start_batch_index}, skipping batch {batch_idx}..." 
+ logging.info( + "Resuming from saved batch index %d, skipping batch %d ...", + start_batch_index, + batch_idx, ) if start_batch_index == len(train_dataloader): # If saving at the last batch of the file, read from the next file start_batch_index = 0 break continue - else: - start_batch_index = 0 - if args.use_wiki_data: + start_batch_index = 0 + + if args.use_bert_data: input_ids, _, attention_mask, _, _ = input_data else: input_ids, attention_mask = input_data @@ -308,28 +318,29 @@ def should_record(): if total_steps >= args.max_steps: break + torch.cuda.synchronize() step_start = time.time() if grad_accumulation_boundary(batch_idx - 1): optimizer.zero_grad(set_to_none=True) if args.logits_output: - train_output = train_step(model, optimizer, input_ids, attention_mask, args) + train_output = train_step(model, input_ids, attention_mask, args) loss_mb = train_output["loss"] logits_mb = train_output["logits"] if smp.tp_size() > 1: - logits = torch.cat(tuple(logits_mb.outputs), dim=1) + logits = torch.cat(tuple(logits_mb.outputs), dim=1) # pylint: disable=no-member else: - logits = torch.cat(tuple(logits_mb.outputs), dim=0) + logits = torch.cat(tuple(logits_mb.outputs), dim=0) # pylint: disable=no-member else: # Return value, loss_mb is a StepOutput object - loss_mb = train_step(model, optimizer, input_ids, attention_mask, args) + loss_mb = train_step(model, input_ids, attention_mask, args) # smdistributed: Average the loss across microbatches. loss = loss_mb.reduce_mean() if not args.validation_freq: loss_metric = loss.item() - + if args.enable_memory_profiling > 0: memory_status_cpu("After_train_step_cpu") memory_status(msg="After_train_step") @@ -338,11 +349,11 @@ def should_record(): # empty the cache to avoid OOM torch.cuda.empty_cache() - if grad_accumulation_boundary(batch_idx): - if args.fp16: + if args.sharded_data_parallel_degree < 1: + # as SDP does its own clipping through sdp_gradient_clipping arg in init config optimizer.clip_master_grads(args.grad_clip) - + optimizer.step() if not (args.fp16 and optimizer.overflow): lr_scheduler.step() @@ -350,33 +361,77 @@ def should_record(): if args.enable_memory_profiling > 0: memory_status(msg="After_opt_step") + torch.cuda.synchronize() + if args.log_param_norms and args.sharded_data_parallel_degree > 1: + log_param_norms(model, optimizer, param_id_to_buffer) total_steps += 1 time_elapsed = time.time() - start step_time = time.time() - step_start sample_processed = input_ids.shape[0] * dp_size throughput = sample_processed / step_time - tokens_per_gpu = input_ids.shape[0] * input_ids.shape[1] + throughputs.append(throughput) + + # Based on the formula in + # https://developer.nvidia.com/blog/scaling-language-model-training-to-a-trillion-parameters-using-megatron/ + tflops_per_gpu = compute_tflops( + throughput, num_params, smp.size(), input_ids.shape[1], log = (batch_idx == 0) + ) + + if not total_steps % args.logging_freq and args.log_reduced_training_loss > 0: + loss_detached = loss.detach() + torch.distributed.all_reduce(loss_detached, group=smp.get_dp_process_group()) + loss_scalar = loss_detached.item() / smp.dp_size() + else: + loss_scalar = loss.item() - # Based on the formula in https://developer.nvidia.com/blog/scaling-language-model-training-to-a-trillion-parameters-using-megatron/ - tflops_per_gpu = 8 * num_params * tokens_per_gpu / step_time / 1e12 if smp.rank() == 0 and not total_steps % args.logging_freq: - print( - f"({int(time_elapsed)}s), Batch {total_steps - 1} Loss: {loss.item()}, Speed: {throughput} samples/sec, 
TFLOPS/GPU: {tflops_per_gpu}" + if args.sharded_data_parallel_degree > 1: + gradnorm_str = f", Grad norm: {optimizer._global_grad_norm}" + else: + gradnorm_str = "" + logging.info( + "(%ds), Batch %d Loss: %s, Speed: %s samples/sec, TFLOPS/GPU: %s %s", + int(time_elapsed), + total_steps - 1, + loss_scalar, + throughput, + tflops_per_gpu, + gradnorm_str, ) + # Compute average throughput and tflops after 30 steps to remove + # high variance in initial steps + if len(throughputs) > 30: + avg_throughput = np.average(throughputs[30:]) + avg_tflops = compute_tflops( + avg_throughput, num_params, smp.size(), input_ids.shape[1] + ) + logging.info( + f"Batch {total_steps - 1}," + + f" Running Avg Speed: {avg_throughput} samples/sec," + + f" Running Avg TFLOPS/GPU: {avg_tflops}" + ) # evaluate on validation - if args.validation_freq and not (total_steps % args.validation_freq): + if args.validation_freq and not total_steps % args.validation_freq: + # In GPT-NeoX runs with SDPTP, validation runs require a clean cache + torch.cuda.empty_cache() cur_state = np.random.get_state() model = model.eval() val_loss, val_ppl = eval_model( - model, val_dataloader, args.validation_batches, args.use_wiki_data + model, val_dataloader, args.validation_batches, args.use_bert_data ) if is_main_process(smp.rank()): - print( - f"({int(time.time()-start)}s) Batch {total_steps - 1} Validation loss: {val_loss}" + logging.info( + "(%ds) Batch %d Validation loss: %s", + int(time.time() - start), + total_steps - 1, + val_loss, ) - print( - f"({int(time.time()-start)}s) Batch {total_steps - 1} Validation perplexity: {val_ppl}" + logging.info( + "(%ds) Batch %d Validation perplexity: %s", + int(time.time() - start), + total_steps - 1, + val_ppl, ) loss_metric = val_loss if args.logits_output: @@ -386,27 +441,29 @@ def should_record(): np.random.set_state(cur_state) # checkpoint - if not (total_steps % args.checkpoint_freq): + if not total_steps % args.checkpoint_freq: user_content = { "cli_args": args.__dict__, "num_params": num_params, "total_steps": total_steps, "start_train_path_index": curr_train_path_index, "model_config": model_config, - "start_batch_index": batch_idx+1, + "start_batch_index": batch_idx + 1, } - # to reconstruct the full model - if args.sharded_data_parallel_degree > 1: - user_content["buffer_names"] = get_buffer_names(model) - user_content["param_shapes"] = get_param_shapes(model, optimizer) + user_content["lr_scheduler"] = lr_scheduler.state_dict() - smp.save_checkpoint(args.checkpoint_dir, + # buffer_names and param_shapes used to reconstruct the full model + # are automatically saved by smp.save_checkpoint() in user_content + # for partial checkpoints + smp.save_checkpoint( + args.checkpoint_dir, tag=f"total_steps{total_steps}", partial=True, model=model, optimizer=optimizer, user_content=user_content, - num_kept_partial_checkpoints=args.num_kept_checkpoints) + num_kept_partial_checkpoints=args.num_kept_checkpoints, + ) if args.logits_output: to_save["loss"].append(loss.item()) @@ -416,19 +473,25 @@ def should_record(): to_save["logits"] = logits.detach().cpu() output_file = f"rank_{smp.rank()}_" + args.logits_output torch.save(to_save, os.path.join(args.model_dir, output_file)) - print(f"logits and loss saved at {os.path.join(args.model_dir, output_file)}") + logging.info( + "logits and loss saved at %s", os.path.join(args.model_dir, output_file) + ) break del train_dataloader if args.parallel_proc_data_processing: - s = time.time() + s = time.time() # pylint: disable=invalid-name train_dataloader = 
dataset_future.result(timeout=None) wait_time = time.time() - s if wait_time > 1: - # TODO if this happens, we should try num_workers>1 in dataloader - print( - f"[{smp.rank()}] Waited {wait_time} for data loader to be ready. Please check if dataloader performance can be improved to avoid these waits." + # TODO if this happens, we should try num_workers>1 in dataloader # pylint: disable=fixme + logging.info( + "[%d] Waited %s for data loader to be ready. " + "Please check if dataloader performance can be " + "improved to avoid these waits.", + smp.rank(), + wait_time, ) else: train_dataloader = create_pretraining_dataloader( @@ -444,10 +507,12 @@ def should_record(): data_type=data_type, ) - return total_steps, throughput, loss_metric + # Using median throughput across all steps, could be more robust. + return total_steps, np.median(throughputs) if throughputs else 0, loss_metric -def parse_args(): +def parse_args(): # pylint: disable=too-many-statements + """Parse args.""" parser = argparse.ArgumentParser() # hyperparameters sent by the client are passed as command-line arguments to the script. @@ -459,16 +524,17 @@ def parse_args(): "--train_batch_size", type=int, default=4, - help="batch size per dp rank, for tensor parallelism degree 8 with pipeline parallel degree 1 this means 8*this batch size per node", + help="batch size per dp rank, for tensor parallelism degree 8 with pipeline parallel degree 1 this means 8*this batch size per node", # pylint: disable=line-too-long ) opt_grp.add_argument("--val_batch_size", type=int, default=4) - opt_grp.add_argument("--max_steps", type=int, default=5000) + opt_grp.add_argument("--max_steps", "--max_training_steps", type=int, default=5000) opt_grp.add_argument("--seed", type=int, default=12345) opt_grp.add_argument("--same_seed", type=int, default=0) opt_grp.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"]) opt_grp.add_argument("--fp16", default=0, type=int, help="automatic mixed precision training") opt_grp.add_argument("--bf16", default=0, type=int, help="automatic mixed precision training") opt_grp.add_argument("--sharded_data_parallel_degree", default=1, type=int) + opt_grp.add_argument("--ddp_dist_backend", type=str, default="auto") opt_grp.add_argument("--grad_clip", default=1.0, type=float, help="gradient clipping") opt_grp.add_argument("--weight_decay", default=0.01, type=float, help="weight decay") opt_grp.add_argument( @@ -486,10 +552,22 @@ def parse_args(): parser.add_argument( "--logging_freq", type=int, default=1, help="number of iterations between logging" ) + parser.add_argument( + "--log_param_norms", + type=int, + default=0, + help="to log param norms with logging_freq frequency, currently works only for sharded data parallel jobs", # pylint: disable=line-too-long + ) + parser.add_argument( + "--log_reduced_training_loss", + type=int, + default=0, + help="to log training loss after reducing across all data parallel ranks with logging_freq frequency", # pylint: disable=line-too-long + ) # I/O io_grp = parser.add_argument_group(title="io", description="location for input and output") - io_grp.add_argument("--use_wiki_data", type=int, default=0, help="use wiki corpus data for training") + io_grp.add_argument("--use_bert_data", type=int, default=0, help="use bert data for training") io_grp.add_argument("--zipped_data", type=int, default=1, help="input data is zipped files") io_grp.add_argument( "--epochs", type=int, default=3, help="times of iterating over the training dataset" @@ -499,13 +577,13 @@ def 
parse_args(): "--checkpoint-dir", type=str, default="/opt/ml/checkpoints", - help="Saves partial checkpoints (model, optimizer) to this dir, and loads latest checkpoint from this if load_partial is specified.", + help="Saves partial checkpoints (model, optimizer) to this dir, and loads latest checkpoint from this if load_partial is specified.", # pylint: disable=line-too-long ) io_grp.add_argument( "--model-dir", type=str, default=os.environ["SM_MODEL_DIR"], - help="Saves full model for inference to this dir. Also used if load_full is given to load the model. Note the lack of optimizer state here.", + help="Saves full model for inference to this dir. Also used if load_full is given to load the model. Note the lack of optimizer state here.", # pylint: disable=line-too-long ) io_grp.add_argument("--training-dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) io_grp.add_argument("--test-dir", type=str, default=os.environ["SM_CHANNEL_TEST"]) @@ -513,7 +591,7 @@ def parse_args(): "--parallel_proc_data_processing", type=int, default=0, - help="Load data in parallel with a different process. At any point a process can have two files in memory. With tensor parallelism, each of the 8 processes on an instance will then have 2 files in memory. Depending on file sizes this may or may not be feasible. With pipeline parallelism this was not a problem as only 1 rank on an instance loaded data.", + help="Load data in parallel with a different process. At any point a process can have two files in memory. With tensor parallelism, each of the 8 processes on an instance will then have 2 files in memory. Depending on file sizes this may or may not be feasible. With pipeline parallelism this was not a problem as only 1 rank on an instance loaded data.", # pylint: disable=line-too-long ) io_grp.add_argument( "--save_final_full_model", @@ -527,23 +605,39 @@ def parse_args(): "--logits_output", type=str, default="", help="Path to save logits and loss" ) io_grp.add_argument("--prescaled_batch", type=int, default=1, help="use prescaled batch") - # configure model size model_grp = parser.add_argument_group( title="model", description="arguments to describe model configuration" ) + model_grp.add_argument( + "--fine_tune", + type=int, + default=0, + help="Fine-tune model from checkpoint or pretrained model", + ) + model_grp.add_argument("--model_name", type=str, default="", help="HF model name") model_grp.add_argument("--max_context_width", type=int, default=1024) model_grp.add_argument("--vocab_size", type=int, default=50264) model_grp.add_argument("--hidden_width", type=int, default=768) + model_grp.add_argument("--intermediate_size", type=int, default=2048) model_grp.add_argument("--num_layers", type=int, default=12) model_grp.add_argument("--num_heads", type=int, default=12) model_grp.add_argument("--resid_pdrop", type=float, default=0.1) model_grp.add_argument("--embd_pdrop", type=float, default=0.1) model_grp.add_argument("--attn_pdrop", type=float, default=0.1) + model_grp.add_argument("--alibi", type=float, default=0) model_grp.add_argument("--summary_first_pdrop", type=float, default=0.1) model_grp.add_argument("--use_adamw", type=int, default=0, help="Use adamw optimizer") - model_grp.add_argument("--use_distributed_transformer", type=int, default=1, help="Use distributed transformer") - model_grp.add_argument("--checkpoint_sublayers", type=int, default=0, help="Apply activation checkpointing to submodules of each transformer layer") + model_grp.add_argument( + "--use_distributed_transformer", type=int, 
default=1, help="Use distributed transformer" + ) + model_grp.add_argument( + "--checkpoint_sublayers", + type=int, + default=0, + help="Apply activation checkpointing to submodules of each transformer layer", + ) + model_grp.add_argument("--initializer_range", type=float, default=0.02) smp_grp = parser.add_argument_group(title="smp", description="smp") smp_grp.add_argument("--tensor_parallel_degree", type=int, default=1) @@ -558,15 +652,30 @@ def parse_args(): smp_grp.add_argument("--static_mode", type=int, default=0) smp_grp.add_argument("--delayed_param", type=int, default=0) smp_grp.add_argument("--same_partition_load", type=int, default=0) - smp_grp.add_argument("--attention_in_fp32", type=int, default=0) + smp_grp.add_argument( + "--attention_in_fp32", + type=int, + default=0, + help="When using FP16 and if the activations overflow, doing the attention computation in fp32 may help. But note that this can substantially increase memory usage and reduce performance. We recommend using bf16 instead which is more numerically stable and would not need this.", # pylint: disable=line-too-long + ) + smp_grp.add_argument( + "--residual_addition_in_fp32", + type=int, + default=0, + help="When using FP16 and if the activations overflow, adding residuals in fp32 may help. But note that this can substantially increase memory usage and reduce performance. We recommend using bf16 instead which is more numerically stable and would not need this.", # pylint: disable=line-too-long + ) smp_grp.add_argument("--placement_strategy", type=str, default="cluster") smp_grp.add_argument("--activation_loading_horizon", type=int, default=4) smp_grp.add_argument("--skip_tracing", type=int, default=0) - smp_grp.add_argument("--query_key_layer_scaling", type=int, default=1) + smp_grp.add_argument("--query_key_layer_scaling", type=int, default=0) smp_grp.add_argument("--fused_softmax", type=int, default=1) + smp_grp.add_argument("--flash_attention", type=int, default=1) smp_grp.add_argument("--fused_dropout", type=int, default=0) smp_grp.add_argument("--fused_bias_gelu", type=int, default=1) smp_grp.add_argument("--gradient_accumulation", type=int, default=1) + smp_grp.add_argument("--model_type", type=str, default="gpt2") + smp_grp.add_argument("--rotary_pct", type=float, default=0.25) + smp_grp.add_argument("--rotary_emb_base", type=int, default=10000) parser.add_argument( "--num_kept_checkpoints", @@ -620,7 +729,7 @@ def parse_args(): "--gather_if_shard", type=int, default=1, - help="When sharding opt states is enabled, gather the opt checkpoint to rdp rank 0 during saving", + help="When sharding opt states is enabled, gather the opt checkpoint to rdp rank 0 during saving", # pylint: disable=line-too-long ) parser.add_argument( "--clean_cache", @@ -679,24 +788,49 @@ def parse_args(): args, _ = parser.parse_known_args() return args + def compute_num_params(model): + """Get num params.""" num_params = 0 seen = set() - for p in model.parameters(): + for p in model.parameters(): # pylint: disable=invalid-name if p not in seen: seen.add(p) if hasattr(p, "ds_shape"): - num_params += np.prod(p.ds_shape) + num_params += np.prod(p.ds_shape) else: num_params += np.prod(p.size()) - - return num_params -def main(): + return num_params + + +def compute_tflops(throughput, num_params, num_gpus, seq_len, log = False): + """Compute TFLOPs.""" + tflops = 8 * throughput * num_params / num_gpus * seq_len * 1e-12 + if log and smp.rank() == 0: + logging.info("Compute tflops: (%s, %s, %s, %s) ==> %s.", + throughput, num_params, 
num_gpus, seq_len, tflops) + + return tflops + + +def _show_env_vars(rank: Optional[int] = 0): + env_var = os.environ + if rank is None or smp.rank() == rank: + logging.info("Env variables (len = %d):", len(env_var)) + + count = 0 + for key, value in sorted(env_var.items()): + logging.info(" env [%03d/%03d] %-20s: `%s`", count, len(env_var), key, value) + count += 1 + + +def main(): # pylint: disable=too-many-branches,too-many-locals,too-many-statements + """Main function to train GPT.""" args = parse_args() if args.partition_assignment != "" and args.manual_partition == 0: - print("[Warning] partition_assignment is set, enable manual_partition") + logging.warning("Partition_assignment is set, enable manual_partition.") args.manual_partition = 1 # any value here is overriden by the config set in notebook when launching the sagemaker job @@ -715,64 +849,61 @@ def main(): "placement_strategy": args.placement_strategy, "activation_loading_horizon": args.activation_loading_horizon, "skip_tracing": args.skip_tracing > 0, - "auto_partition": False if args.manual_partition else True, + "auto_partition": not args.manual_partition, "default_partition": 0, "static_mode": args.static_mode > 0, "fast_mode": args.fast_mode > 0, "sharded_data_parallel_degree": args.sharded_data_parallel_degree, + "ddp_dist_backend": args.ddp_dist_backend, + "sdp_hierarchical_allgather": False, + "sdp_gradient_clipping": args.grad_clip, } if args.active_microbatches is not None: smp_config["active_microbatches"] = args.active_microbatches - + if args.log_param_norms and args.use_distributed_transformer == 1: + logging.warning( + "Script currently doesn't support logging param norms when using distributed transformer, disabling log_param_norms" # pylint: disable=line-too-long + ) smp.init(smp_config) + _show_env_vars(0) + if smp.rank() == 0: - print("Arguments:", args.__dict__) - print(f"Transformers version: {transformers.__version__}") - print(f"smdistributed.modelparallel version: {smdistributed.modelparallel.__version__}") - print(f"smdistributed config: {smp_config}") + logging.info("Arguments: %s", args.__dict__) + logging.info("Transformers version: %s", transformers.__version__) + logging.info( + "smdistributed.modelparallel version: %s", smdistributed.modelparallel.__version__ + ) + logging.info("smdistributed config: %s", smp_config) if args.save_final_full_model and smp.rank() == 0: - print( - f"[Warning] Note that save_final_full_model only saves the final model at the end of all steps. It does not save optimizer state. Optimizer state is only saved with partial models which are saved at checkpointing_freq during training. If you want to restart training you need partial checkpoints." + logging.warning( + "Note that save_final_full_model only saves the final model at the end " + "of all steps. It does not save optimizer state. Optimizer state is only " + "saved with partial models which are saved at checkpointing_freq during " + "training. If you want to restart training you need partial checkpoints." 
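To make the `compute_tflops` bookkeeping above concrete, here is the same formula evaluated with illustrative (not measured) numbers:

```python
throughput = 20.0    # samples/sec aggregated across the cluster (made-up value)
num_params = 1.5e9   # roughly GPT-2 XL sized
num_gpus = 8         # smp.size() on a single 8-GPU instance
seq_len = 512        # max_context_width in the notebook defaults

tflops_per_gpu = 8 * throughput * num_params / num_gpus * seq_len * 1e-12
print(f"{tflops_per_gpu:.1f} TFLOPS/GPU")  # ~15.4
```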
) if args.partition_assignment != "": partition_assignment = args.partition_assignment.split(",") - assert ( - len(partition_assignment) == smp.pp_size() - ), f"partition_assignment must have the same size as pipeline parallel degree, but getting {len(partition_assignment)} vs {smp.pp_size()}" - - model_config = GPT2Config( - vocab_size=args.vocab_size, - n_positions=args.max_context_width, - n_embd=args.hidden_width, - n_layer=args.num_layers, - n_head=args.num_heads, - n_inner=None, - activation_function="gelu_new", - resid_pdrop=args.resid_pdrop, - embd_pdrop=args.embd_pdrop, - attn_pdrop=args.attn_pdrop, - layer_norm_epsilon=1e-05, - initializer_range=0.02, - summary_type="cls_index", - summary_use_proj=True, - summary_activation=None, - summary_proj_to_labels=True, - summary_first_dropout=args.summary_first_pdrop, - # gradient_checkpointing=args.gradient_checkpointing > 0, - use_cache=False, - bos_token_id=50256, - eos_token_id=50256, - return_dict=True, + msg = ( + f"partition_assignment must have the same size as pipeline parallel degree, " + f"but getting {len(partition_assignment)} vs {smp.pp_size()}" + ) + logging.fatal("Will fail with: %s.", msg) + raise AssertionError(msg) + + model_config, args = model_config_lib.get_model_config_from_args( + args.model_type, args.model_name, args, log=(smp.rank() == 0) ) # the following improves start-up time by skipping proper initialization # of weights in the original model. this is not a problem because DistributedModel - # will override those weights anyway when we use distributed transformer. + # will override those weights anyway when we use distributed transformer. if args.use_distributed_transformer > 0: - from transformers.modeling_utils import PreTrainedModel + from transformers.modeling_utils import ( # pylint: disable=import-error,import-outside-toplevel + PreTrainedModel, + ) PreTrainedModel.init_weights = lambda x: None @@ -783,34 +914,54 @@ def main(): if args.fp16 and args.bf16: raise ValueError("FP16 and BF16 cannot be simultaneously enabled.") - elif args.fp16: - dtype = torch.float16 + + if args.fp16: + dtype = torch.float16 # pylint: disable=no-member elif args.bf16: - dtype = torch.bfloat16 + dtype = torch.bfloat16 # pylint: disable=no-member else: - dtype = torch.get_default_dtype() + dtype = torch.get_default_dtype() # pylint: disable=no-member + if args.fine_tune > 0 and args.delayed_param > 0 and smp.rank() == 0: + pretrained_model = AutoModelForCausalLM.from_pretrained( + args.model_name or args.model_dir + ) + model_state_dict = pretrained_model.state_dict() + path = os.path.join(args.model_dir, "fullmodel.pt") + torch.save(model_state_dict, path) + smp.barrier() + + # About zero_init: + # we only want to init with zero for actual model for training, + # in disttf case it's used in DistModel wrapper. 
for others we don't need to set zero init + # This is needed only to param_id_to_offset with smp.model_creation( tensor_parallelism=smp.tp_size() > 1 or args.use_distributed_transformer > 0, + zero_init=args.use_distributed_transformer == 0, dtype=dtype, + distribute_embedding=args.sharded_data_parallel_degree > 1 and smp.tp_size() > 1, + use_alibi=args.alibi > 0, attention_in_fp32=args.attention_in_fp32 > 0, + fp32_residual_addition=args.residual_addition_in_fp32 > 0, query_key_layer_scaling=args.query_key_layer_scaling > 0 and args.bf16 < 1, fused_softmax=args.fused_softmax > 0, fused_dropout=args.fused_dropout > 0, fused_bias_gelu=args.fused_bias_gelu > 0, - ): + flash_attention=args.flash_attention > 0, + ): + if args.fine_tune > 0 and args.delayed_param == 0: + model = AutoModelForCausalLM.from_pretrained( + args.model_name or args.model_dir + ) + else: model = AutoModelForCausalLM.from_config(model_config) + if args.enable_memory_profiling > 0: memory_status_cpu(msg="after model creation") - num_params = compute_num_params(model) - if smp.rank() == 0: - print(f"# total parameters: {num_params}") - # smdistributed: Set the device to the GPU ID used by the current process. # Input tensors should be transferred to this device. torch.cuda.set_device(smp.local_rank()) - device = torch.device("cuda") if not args.same_seed: # Set seed by tp_rank to prevent weights from being the same on different tp_ranks @@ -822,35 +973,58 @@ def main(): # the model provided for DistributedModel class instantiation. if args.enable_memory_profiling > 0: memory_status_cpu(msg="before dist model creation") - model = smp.DistributedModel(model, trace_device="gpu", backward_passes_per_step=args.gradient_accumulation) + + model = smp.DistributedModel( + model, trace_device="gpu", backward_passes_per_step=args.gradient_accumulation + ) + if args.enable_memory_profiling > 0: memory_status_cpu(msg="after dist model creation") + m = model.get_module() # pylint: disable=invalid-name + + num_params = compute_num_params(m) + if smp.rank() == 0: + logging.info("# total parameters: %s", num_params) - m = model.get_module() if args.use_distributed_transformer > 0: transformer_layers = m.transformer.seq_layers else: - transformer_layers = m.transformer.h + if args.model_type in ["gpt2", "bloom"]: + transformer_layers = m.transformer.h + elif args.model_type == "gpt_neox": + transformer_layers = m.gpt_neox.layers if args.manual_partition: - print(f"Manual partition enabled") + logging.debug("Manual partition enabled") if args.partition_assignment != "": - get_num_layers = lambda x: int(partition_assignment[x]) - total_layers = sum([get_num_layers(pp_rank) for pp_rank in range(smp.pp_size())]) - assert ( - total_layers == args.num_layers - ), f"partition_assignment must have the same total transformer layers as model, but getting {total_layers} vs {args.num_layers}" - else: - # evenly distribute layers across all partitions - div, rem = divmod(args.num_layers, smp.pp_size()) - get_num_layers = lambda x: (div + 1 if x >= smp.pp_size() - rem else div) + get_num_layers = lambda x: int( # pylint: disable=unnecessary-lambda-assignment + partition_assignment[x] + ) + total_layers = sum(get_num_layers(pp_rank) for pp_rank in range(smp.pp_size())) + + msg = ( + f"partition_assignment must have the same total transformer layers as model, " + f"but getting {total_layers} vs {args.num_layers}" + ) + logging.fatal("Will fail with: %s.", msg) + raise AssertionError(msg) + + # evenly distribute layers across all partitions + div, rem = 
+            get_num_layers = lambda x: (  # pylint: disable=unnecessary-lambda-assignment
+                div + 1 if x >= smp.pp_size() - rem else div
+            )
+
         assignments = []
+        # (TODO) This is required for 175B otherwise a hang for partition "8,17,17,18,18,18"
+        # Need further investigation
+        # for pp_rank in reversed(range(smp.pp_size())):
         for pp_rank in range(smp.pp_size()):
-            nl = get_num_layers(pp_rank)
-            print(f"{nl} layers assigned to partition {pp_rank}")
+            nl = get_num_layers(pp_rank)  # pylint: disable=invalid-name
+            logging.debug("%s layers assigned to partition %d", nl, pp_rank)
             assignments += [pp_rank for _ in range(nl)]

-        for i, c in enumerate(transformer_layers.children()):
+        for i, c in enumerate(transformer_layers.children()):  # pylint: disable=invalid-name
             smp.set_partition(c, assignments[i])

     param_groups = get_param_groups_by_weight_decay(m)
@@ -864,41 +1038,65 @@ def main():
         param_groups, betas=(args.beta1, args.beta2), lr=args.lr, weight_decay=args.weight_decay
     )

-    if args.activation_checkpointing:
+    if args.activation_checkpointing:  # pylint: disable=too-many-nested-blocks
         if args.use_distributed_transformer or smp.tp_size() > 1:
             if args.checkpoint_sublayers:
-                for c in transformer_layers.children():
+                for c in transformer_layers.children():  # pylint: disable=invalid-name
                     smp.set_activation_checkpointing(c.attention)
                     smp.set_activation_checkpointing(c.output)
             else:
-                smp.set_activation_checkpointing(transformer_layers, strategy=args.activation_strategy)
+                smp.set_activation_checkpointing(
+                    transformer_layers, strategy=args.activation_strategy
+                )
         else:
-            for c in transformer_layers.children():
+            for c in transformer_layers.children():  # pylint: disable=invalid-name
                 if args.checkpoint_sublayers:
-                    smp.set_activation_checkpointing(c.attn)
-                    smp.set_activation_checkpointing(c.mlp)
+                    if args.model_type == "gpt2":
+                        smp.set_activation_checkpointing(c.attn)
+                        smp.set_activation_checkpointing(c.mlp)
+                    elif args.model_type in ["gpt_neox", "bloom"]:
+                        if args.model_type == "gpt_neox":
+                            smp.set_activation_checkpointing(c.attention)
+                        elif args.model_type == "bloom":
+                            smp.set_activation_checkpointing(c.self_attention)
+                        smp.set_activation_checkpointing(c.input_layernorm)
+                        smp.set_activation_checkpointing(c.post_attention_layernorm)
+                        smp.set_activation_checkpointing(c.mlp)
                 else:
                     smp.set_activation_checkpointing(c)

+    if args.sharded_data_parallel_degree > 1 and args.use_distributed_transformer == 0:
+        param_id_to_offset = build_param_id_to_offset(param_groups)
+
     optimizer = smp.DistributedOptimizer(
-        optimizer,
-        static_loss_scale=None,
+        optimizer,
+        static_loss_scale=None,
         dynamic_loss_scale=True,
         dynamic_loss_args={"scale_window": 1000, "min_scale": 1, "delayed_shift": 2},
-    )
+    )
+
+    if args.fine_tune > 0 and args.delayed_param > 0:
+        smp.resume_from_checkpoint(args.model_dir, tag="fullmodel.pt", partial=False)
+
+    if args.sharded_data_parallel_degree > 1 and args.use_distributed_transformer == 0:
+        param_id_to_buffer = build_param_id_to_buffer(optimizer, param_id_to_offset)
+    else:
+        param_id_to_buffer = None
+
     lr_scheduler = get_learning_rate_scheduler(optimizer, args)

     if args.enable_memory_profiling > 0:
         model.register_post_partition_hook(
-            lambda model, optimizer: memory_status(msg="After_partition")
+            lambda model, optimizer: memory_status(msg="After partition")
         )

     # load after wrapping model and optimizer with smp Distributed...
     if args.load_full or args.load_partial:
         if args.load_partial and args.load_full:
-            print(
-                "Since both --load_partial and --load_full set, will try to load from full checkpoint."
-                "If the intention is to load from partial checkpoint, please don't set --load_full"
-            )
+            logging.info(
+                "Since both --load_partial and --load_full set, will try to load from full "
+                "checkpoint. If the intention is to load from partial checkpoint, please don't set "
+                "--load_full"
+            )
         partial = not args.load_full
         path = args.checkpoint_dir if partial else args.model_dir
@@ -914,6 +1112,10 @@ def main():
     start_train_path_index = 0
     start_batch_index = 0

+    # Empty the cache to clear memory when loading with partial checkpointing
+    # for SDPTP and GPT NeoX
+    torch.cuda.empty_cache()
+
     start = time.time()
     total_steps, throughput, loss = train(
         model,
@@ -925,17 +1127,28 @@ def main():
         num_params,
         total_steps,
         args,
+        param_id_to_buffer,
     )
     time_to_train = time.time() - start
     if args.ci:
-        print(f"[SMP_METRIC]__GPT2__Time_to_train__{time_to_train}")
-        print(f"[SMP_METRIC]__GPT2__samples/second__{throughput}")
-        print(f"[SMP_METRIC]__GPT2__Loss__{loss}")
+        logging.info("[SMP_METRIC]__GPT2__Time_to_train__%s", time_to_train)
+        logging.info("[SMP_METRIC]__GPT2__samples/second__%s", throughput)
+        logging.info("[SMP_METRIC]__GPT2__Loss__%s", loss)
         if not args.load_partial and not args.load_full:
-            assert time_to_train < args.time_to_train
-            assert throughput > args.throughput
-            if args.loss:
-                assert loss < args.loss
+            if time_to_train >= args.time_to_train:
+                msg = f"Time to train ({time_to_train}) >= threshold ({args.time_to_train})"
+                logging.fatal("Will fail with: %s.", msg)
+                raise AssertionError(msg)
+
+            if throughput <= args.throughput:
+                msg = f"Throughput ({throughput}) <= threshold ({args.throughput})"
+                logging.fatal("Will fail with: %s.", msg)
+                raise AssertionError(msg)
+
+            if args.loss and loss >= args.loss:
+                msg = f"Loss ({loss}) >= threshold ({args.loss})"
+                logging.fatal("Will fail with: %s.", msg)
+                raise AssertionError(msg)

     if args.save_final_full_model:
         # saves full model at the end
@@ -945,29 +1158,24 @@ def main():
             "total_steps": total_steps,
             "model_config": model_config,
         }
-        if args.sharded_data_parallel_degree > 1:
-            # When sharded_data_parallel_degree > 1, saving full model is not supported, saving partial instead
-            # To get the full model, one can use the following API
-            # > from sharded_data_parallel_checkpoint import get_full_state_dict_from_sharded_data_parallel_checkpoint
-            # > full_model = get_full_state_dict_from_sharded_data_parallel_checkpoint(args.model_dir, tag=f"sharded_data_parallel_final_full_{num_params}", dtype=torch.float32)
-            # > if args.use_distributed_transformer > 0:  # translate the state_dict to hf format if distributed transformer is used
-            # >     full_model = smp.nn.huggingface.gpt2.translate_state_dict_to_hf_gpt2(full_model, max_seq_len=args.max_context_width)
-            # Note: the shared parameter will not be reflected so during loading you might need to load with strict=False
-            user_content["buffer_names"] = get_buffer_names(model)
-            user_content["param_shapes"] = get_param_shapes(model, optimizer)
-            smp.save_checkpoint(args.model_dir,
-                tag=f"sharded_data_parallel_final_full_{num_params}",
-                partial=True,
-                model=model,
-                optimizer=optimizer,
-                user_content=user_content)
-        else:
-            smp.save_checkpoint(args.model_dir, tag="fullmodel.pt", partial=False, model=model, user_content=user_content)
+        smp.save_checkpoint(
+            args.model_dir,
+            tag="fullmodel.pt",
+            partial=False,
+            model=model,
+            user_content=user_content,
+        )

     smp.barrier()
     if smp.rank() == 0:
-        print("SMP training finished successfully")
+        logging.info("SMP training finished successfully")


 if __name__ == "__main__":
+    logging.basicConfig(
+        format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
+        datefmt="%Y-%m-%d:%H:%M:%S",
+        level=logging.INFO,
+    )
+
     main()
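The divmod-based layer placement in the manual-partition hunk above is easier to follow in isolation. Below is a minimal, self-contained sketch written for this review, not code from the patch; the helper name `even_layer_assignment` and the example numbers are invented for illustration under the assumption that the even-distribution branch behaves as shown in the diff.

# Minimal sketch (illustration only, not part of the patch): reproduces the
# divmod-based layer assignment used when no explicit partition_assignment is given.
from typing import List


def even_layer_assignment(num_layers: int, pp_size: int) -> List[int]:
    """Map each transformer layer index to a pipeline-parallel rank."""
    div, rem = divmod(num_layers, pp_size)
    # The last `rem` ranks receive one extra layer each, mirroring
    # `div + 1 if x >= smp.pp_size() - rem else div` in the training script.
    assignments: List[int] = []
    for pp_rank in range(pp_size):
        num = div + 1 if pp_rank >= pp_size - rem else div
        assignments += [pp_rank] * num
    return assignments


if __name__ == "__main__":
    # e.g. 26 layers over 4 ranks -> [0]*6 + [1]*6 + [2]*7 + [3]*7
    print(even_layer_assignment(26, 4))

With such a mapping, layer i would be placed on rank assignments[i], which is what the loop over transformer_layers.children() does via smp.set_partition in the patched code.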