diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644
index 00000000..c8d71c96
--- /dev/null
+++ b/doc/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = python3 -msphinx
+SPHINXPROJ = sagemaker
+SOURCEDIR = .
+BUILDDIR = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/doc/_static/custom.css b/doc/_static/custom.css
new file mode 100644
index 00000000..b4bfb4cc
--- /dev/null
+++ b/doc/_static/custom.css
@@ -0,0 +1,61 @@
+/* Custom styles for SageMaker HyperPod documentation */
+
+/* Adjust logo size and alignment */
+.navbar-brand img {
+ max-height: 40px;
+ width: auto;
+ margin-right: 10px;
+ vertical-align: middle;
+}
+
+.navbar-brand .title {
+ font-weight: 800;
+ color: #111827;
+}
+
+/* Ensure logo container doesn't force wrapping */
+.navbar-brand-box {
+ width: auto;
+ flex-shrink: 0;
+}
+
+/* Header styling */
+header {
+ background-color: white;
+
+ box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05);
+ position: sticky;
+ top: 0;
+ z-index: 50;
+}
+
+h1 {
+ font-size: 1.875rem;
+ font-weight: 700;
+ color: #111827;
+}
+
+h2 {
+ font-size: 1.5rem;
+ font-weight: 700;
+ color: #111827;
+}
+
+h3 {
+ font-size: 1.25rem;
+ font-weight: 500;
+ color: #111827;
+}
+
+p {
+ font-size: 1.0rem;
+ color: #4b5563;
+}
+
+html[data-theme="dark"] .navbar-brand .title {
+ color: #f8fafc !important;
+}
+
+html[data-theme="dark"] p {
+ color: #d1d5db !important;
+}
diff --git a/doc/_static/image.png b/doc/_static/image.png
new file mode 100644
index 00000000..c90c4cd2
Binary files /dev/null and b/doc/_static/image.png differ
diff --git a/doc/_static/image_dark.png b/doc/_static/image_dark.png
new file mode 100644
index 00000000..ebcadd94
Binary files /dev/null and b/doc/_static/image_dark.png differ
diff --git a/doc/_static/image_light.svg b/doc/_static/image_light.svg
new file mode 100644
index 00000000..2aed204d
--- /dev/null
+++ b/doc/_static/image_light.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/doc/_static/search_accessories.css b/doc/_static/search_accessories.css
new file mode 100644
index 00000000..c7e09e1f
--- /dev/null
+++ b/doc/_static/search_accessories.css
@@ -0,0 +1,29 @@
+.example-badge {
+ background-color: #c63340;
+ color: white;
+ padding: 0.25rem 0.5rem;
+ text-align: center;
+ border-radius: 5px;
+ font-size: 0.8rem;
+ display: inline-block;
+}
+
+.aws-doc-badge {
+ background-color: #e18b50;
+ color: white;
+ padding: 0.25rem 0.5rem;
+ text-align: center;
+ border-radius: 5px;
+ font-size: 0.8rem;
+ display: inline-block;
+}
+
+.sdk-doc-badge {
+ background-color: #4c968f;
+ color: white;
+ padding: 0.25rem 0.5rem;
+ text-align: center;
+ border-radius: 5px;
+ font-size: 0.8rem;
+ display: inline-block;
+}
\ No newline at end of file
diff --git a/doc/advanced_resources.md b/doc/advanced_resources.md
new file mode 100644
index 00000000..d3e2cc2c
--- /dev/null
+++ b/doc/advanced_resources.md
@@ -0,0 +1,54 @@
+(advanced_resources)=
+
+# Advanced Resources
+
+```{toctree}
+:hidden:
+:maxdepth: 2
+
+examples
+AWS SageMaker HyperPod Docs <https://docs.aws.amazon.com/sagemaker/latest/dg/hyperpod.html>
+HyperPod Developer Guide <https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US>
+SageMaker HyperPod Workshop <https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US>
+
+```
+
+## Advanced Resources
+
+::::{grid} 1 2 2 2
+:gutter: 3
+
+:::{grid-item-card} Github
+:link: examples
+:link-type: ref
+:class-card: sd-border-secondary
+
+**Example Notebooks** - Ready-to-use implementation guides
+:::
+
+:::{grid-item-card} AWS SageMaker HyperPod Docs
+:link: https://docs.aws.amazon.com/sagemaker/latest/dg/hyperpod.html
+:link-type: url
+:class-card: sd-border-secondary
+
+**HyperPod Documentation** - Know more about HyperPod
+:::
+
+:::{grid-item-card} HyperPod Developer Guide
+:link: https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US
+:link-type: url
+:class-card: sd-border-secondary
+
+**Developer Guide** - Refer to this practical development guide
+:::
+
+:::{grid-item-card} SageMaker HyperPod Workshop
+:link: https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US
+:link-type: url
+:class-card: sd-border-secondary
+
+**Practical Guide** - Refer to the workshop for detailed follow-through steps
+:::
+
+
+::::
diff --git a/doc/api/api_index.rst b/doc/api/api_index.rst
new file mode 100644
index 00000000..b5d37197
--- /dev/null
+++ b/doc/api/api_index.rst
@@ -0,0 +1,33 @@
+#############
+SDK Reference
+#############
+
+.. toctree::
+ :hidden:
+ :maxdepth: 2
+
+ training/hyperpod_pytorch_job
+ inference/hp_endpoint
+
+Complete reference for the SageMaker HyperPod SDK.
+
+.. container::
+
+ .. grid:: 1 1 3 3
+ :gutter: 3
+
+ .. grid-item-card:: Training SDK
+ :link: training/hyperpod_pytorch_job
+ :link-type: doc
+ :class-card: sd-border-secondary
+
+ Training SDK classes, methods and parameters.
+
+ .. grid-item-card:: Inference SDK
+ :link: inference/hp_endpoint
+ :link-type: doc
+ :class-card: sd-border-secondary
+
+ Inference SDK classes, methods and parameters.
+
+
diff --git a/doc/api/inference/hp_endpoint.rst b/doc/api/inference/hp_endpoint.rst
new file mode 100644
index 00000000..53afbad0
--- /dev/null
+++ b/doc/api/inference/hp_endpoint.rst
@@ -0,0 +1,45 @@
+Inference
+===========
+
+* `HPEndpointBase`_
+* `HPEndpoint`_
+* `HPJumpStartEndpoint`_
+* `HPEndpoint Configs`_
+
+
+HPEndpointBase
+-------------------
+
+.. automodule:: sagemaker.hyperpod.inference.hp_endpoint_base
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+HPEndpoint
+-------------------
+
+.. automodule:: sagemaker.hyperpod.inference.hp_endpoint
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+HPJumpStartEndpoint
+---------------------
+
+.. automodule:: sagemaker.hyperpod.inference.hp_jumpstart_endpoint
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+HPEndpoint Configs
+-------------------
+
+.. automodule:: sagemaker.hyperpod.inference.config.hp_endpoint_config
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: sagemaker.hyperpod.inference.config.hp_jumpstart_endpoint_config
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/doc/api/metadata.rst b/doc/api/metadata.rst
new file mode 100644
index 00000000..6ae5472d
--- /dev/null
+++ b/doc/api/metadata.rst
@@ -0,0 +1,7 @@
+Metadata
+------------
+
+.. automodule:: sagemaker.hyperpod.common.config.metadata
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/doc/api/training/hyperpod_pytorch_job.rst b/doc/api/training/hyperpod_pytorch_job.rst
new file mode 100644
index 00000000..6a33dddd
--- /dev/null
+++ b/doc/api/training/hyperpod_pytorch_job.rst
@@ -0,0 +1,24 @@
+Training
+===========
+
+* `HyperPodPytorchJob`_
+* `HyperPodPytorchJob Configs`_
+
+
+HyperPodPytorchJob
+-------------------
+
+.. automodule:: sagemaker.hyperpod.training.hyperpod_pytorch_job
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+HyperPodPytorchJob Configs
+---------------------------
+
+.. automodule:: sagemaker.hyperpod.training.config.hyperpod_pytorch_job_unified_config
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
diff --git a/doc/cli_inference.md b/doc/cli_inference.md
new file mode 100644
index 00000000..1c79a706
--- /dev/null
+++ b/doc/cli_inference.md
@@ -0,0 +1,344 @@
+(cli_inference)=
+
+# Inference
+
+Complete reference for SageMaker HyperPod inference parameters and configuration options.
+
+* [Create JumpStart Endpoint](#hyp-create-hyp-jumpstart-endpoint)
+* [Create Custom Endpoint](#hyp-create-hyp-custom-endpoint)
+
+* [List JumpStart Endpoints](#hyp-list-hyp-jumpstart-endpoint)
+* [List Custom Endpoints](#hyp-list-hyp-custom-endpoint)
+* [Describe JumpStart Endpoint](#hyp-describe-hyp-jumpstart-endpoint)
+* [Describe Custom Endpoint](#hyp-describe-hyp-custom-endpoint)
+* [Invoke JumpStart Endpoint](#hyp-invoke-hyp-jumpstart-endpoint)
+* [Invoke Custom Endpoint](#hyp-invoke-hyp-custom-endpoint)
+* [Delete JumpStart Endpoint](#hyp-delete-hyp-jumpstart-endpoint)
+* [Delete Custom Endpoint](#hyp-delete-hyp-custom-endpoint)
+
+* [List JumpStart Pods](#hyp-list-pods-hyp-jumpstart-endpoint)
+* [List Custom Pods](#hyp-list-pods-hyp-custom-endpoint)
+* [Get JumpStart Logs](#hyp-get-logs-hyp-jumpstart-endpoint)
+* [Get Custom Logs](#hyp-get-logs-hyp-custom-endpoint)
+* [Get JumpStart Operator Logs](#hyp-get-operator-logs-hyp-jumpstart-endpoint)
+* [Get Custom Operator Logs](#hyp-get-operator-logs-hyp-custom-endpoint)
+
+
+
+## hyp create hyp-jumpstart-endpoint
+
+Deploy pre-trained models from SageMaker JumpStart.
+
+#### Syntax
+
+```bash
+hyp create hyp-jumpstart-endpoint [OPTIONS]
+```
+
+#### Required Parameters
+
+- `--model-id TEXT`: JumpStart model identifier (1-63 characters, alphanumeric with hyphens)
+- `--instance-type TEXT`: EC2 instance type for inference (must start with "ml.")
+
+#### Optional Parameters
+
+- `--accept-eula BOOLEAN`: Whether model terms of use have been accepted (default: false)
+- `--model-version TEXT`: Semantic version of the model (e.g., "1.0.0", 5-14 characters)
+- `--endpoint-name TEXT`: Name of SageMaker endpoint (1-63 characters, alphanumeric with hyphens)
+- `--tls-certificate-output-s3-uri TEXT`: S3 URI to write the TLS certificate (optional)
+
+## hyp create hyp-custom-endpoint
+
+Deploy custom models with your own inference code.
+
+#### Syntax
+
+```bash
+hyp create hyp-custom-endpoint [OPTIONS]
+```
+
+#### Required Parameters
+
+- `--instance-type TEXT`: EC2 instance type for inference (must start with "ml.")
+- `--model-name TEXT`: Name of model to create on SageMaker (1-63 characters, alphanumeric with hyphens)
+- `--model-source-type TEXT`: Model source type ("s3" or "fsx")
+- `--image-uri TEXT`: Docker image URI for inference
+- `--container-port INTEGER`: Port on which model server listens (1-65535)
+- `--model-volume-mount-name TEXT`: Name of the model volume mount
+
+#### Optional Parameters
+
+- `--endpoint-name TEXT`: Name of SageMaker endpoint (1-63 characters, alphanumeric with hyphens)
+- `--env OBJECT`: Environment variables as key-value pairs
+- `--metrics-enabled BOOLEAN`: Enable metrics collection (default: false)
+- `--model-version TEXT`: Version of the model (semantic version format)
+- `--model-location TEXT`: Specific model data location
+- `--prefetch-enabled BOOLEAN`: Whether to pre-fetch model data (default: false)
+- `--tls-certificate-output-s3-uri TEXT`: S3 URI for TLS certificate output
+- `--fsx-dns-name TEXT`: FSx File System DNS Name
+- `--fsx-file-system-id TEXT`: FSx File System ID
+- `--fsx-mount-name TEXT`: FSx File System Mount Name
+- `--s3-bucket-name TEXT`: S3 bucket location
+- `--s3-region TEXT`: S3 bucket region
+- `--model-volume-mount-path TEXT`: Path inside container for model volume (default: "/opt/ml/model")
+- `--resources-limits OBJECT`: Resource limits for the worker
+- `--resources-requests OBJECT`: Resource requests for the worker
+- `--dimensions OBJECT`: CloudWatch Metric dimensions as key-value pairs
+- `--metric-collection-period INTEGER`: Period for CloudWatch query (default: 300)
+- `--metric-collection-start-time INTEGER`: StartTime for CloudWatch query (default: 300)
+- `--metric-name TEXT`: Metric name to query for CloudWatch trigger
+- `--metric-stat TEXT`: Statistics metric for CloudWatch (default: "Average")
+- `--metric-type TEXT`: Type of metric for HPA ("Value" or "Average", default: "Average")
+- `--min-value NUMBER`: Minimum metric value for empty CloudWatch response (default: 0)
+- `--cloud-watch-trigger-name TEXT`: Name for the CloudWatch trigger
+- `--cloud-watch-trigger-namespace TEXT`: AWS CloudWatch namespace for the metric
+- `--target-value NUMBER`: Target value for the CloudWatch metric
+- `--use-cached-metrics BOOLEAN`: Enable caching of metric values (default: true)
+- `--invocation-endpoint TEXT`: Invocation endpoint path (default: "invocations")
+
+## Inference Endpoint Management Commands
+
+Commands for managing inference endpoints.
+
+### hyp list hyp-jumpstart-endpoint
+
+List JumpStart model endpoints.
+
+#### Syntax
+
+```bash
+hyp list hyp-jumpstart-endpoint [OPTIONS]
+```
+
+#### Optional Parameters
+
+- `--namespace TEXT`: Namespace to list endpoints from (default: "default")
+
+### hyp list hyp-custom-endpoint
+
+List custom model endpoints.
+
+#### Syntax
+
+```bash
+hyp list hyp-custom-endpoint [OPTIONS]
+```
+
+#### Optional Parameters
+
+- `--namespace TEXT`: Namespace to list endpoints from (default: "default")
+
+### hyp describe hyp-jumpstart-endpoint
+
+Describe a JumpStart model endpoint.
+
+#### Syntax
+
+```bash
+hyp describe hyp-jumpstart-endpoint [OPTIONS]
+```
+
+#### Required Parameters
+
+- `--name TEXT`: Name of the endpoint to describe
+
+#### Optional Parameters
+
+- `--namespace TEXT`: Namespace of the endpoint (default: "default")
+- `--full`: Display full JSON output
+
+### hyp describe hyp-custom-endpoint
+
+Describe a custom model endpoint.
+
+#### Syntax
+
+```bash
+hyp describe hyp-custom-endpoint [OPTIONS]
+```
+
+#### Required Parameters
+
+- `--name TEXT`: Name of the endpoint to describe
+
+#### Optional Parameters
+
+- `--namespace TEXT`: Namespace of the endpoint (default: "default")
+- `--full`: Display full JSON output
+
+### hyp invoke hyp-jumpstart-endpoint
+
+Invoke a JumpStart model endpoint.
+
+#### Syntax
+
+```bash
+hyp invoke hyp-jumpstart-endpoint [OPTIONS]
+```
+
+#### Required Parameters
+
+- `--endpoint-name TEXT`: Name of the endpoint to invoke
+- `--body TEXT`: Request body (JSON format)
+
+#### Optional Parameters
+
+- `--content-type TEXT`: Content type of the request (default: "application/json")
+
+### hyp invoke hyp-custom-endpoint
+
+Invoke a custom model endpoint.
+
+#### Syntax
+
+```bash
+hyp invoke hyp-custom-endpoint [OPTIONS]
+```
+
+#### Required Parameters
+
+- `--endpoint-name TEXT`: Name of the endpoint to invoke
+- `--body TEXT`: Request body (JSON format)
+
+#### Optional Parameters
+
+- `--content-type TEXT`: Content type of the request (default: "application/json")
+
+### hyp delete hyp-jumpstart-endpoint
+
+Delete a JumpStart model endpoint.
+
+#### Syntax
+
+```bash
+hyp delete hyp-jumpstart-endpoint [OPTIONS]
+```
+
+#### Required Parameters
+
+- `--name TEXT`: Name of the endpoint to delete
+
+#### Optional Parameters
+
+- `--namespace TEXT`: Namespace of the endpoint (default: "default")
+
+### hyp delete hyp-custom-endpoint
+
+Delete a custom model endpoint.
+
+#### Syntax
+
+```bash
+hyp delete hyp-custom-endpoint [OPTIONS]
+```
+
+#### Required Parameters
+
+- `--name TEXT`: Name of the endpoint to delete
+
+#### Optional Parameters
+
+- `--namespace TEXT`: Namespace of the endpoint (default: "default")
+
+### hyp list-pods hyp-jumpstart-endpoint
+
+List pods for JumpStart endpoints.
+
+#### Syntax
+
+```bash
+hyp list-pods hyp-jumpstart-endpoint [OPTIONS]
+```
+
+#### Optional Parameters
+
+- `--namespace TEXT`: Namespace to list pods from (default: "default")
+
+### hyp list-pods hyp-custom-endpoint
+
+List pods for custom endpoints.
+
+#### Syntax
+
+```bash
+hyp list-pods hyp-custom-endpoint [OPTIONS]
+```
+
+#### Optional Parameters
+
+- `--namespace TEXT`: Namespace to list pods from (default: "default")
+
+### hyp get-logs hyp-jumpstart-endpoint
+
+Get logs from JumpStart endpoint pods.
+
+#### Syntax
+
+```bash
+hyp get-logs hyp-jumpstart-endpoint [OPTIONS]
+```
+
+#### Required Parameters
+
+- `--pod-name TEXT`: Name of the pod to get logs from
+
+#### Optional Parameters
+
+- `--container TEXT`: Container name to get logs from
+- `--namespace TEXT`: Namespace of the pod (default: "default")
+
+### hyp get-logs hyp-custom-endpoint
+
+Get logs from custom endpoint pods.
+
+#### Syntax
+
+```bash
+hyp get-logs hyp-custom-endpoint [OPTIONS]
+```
+
+#### Required Parameters
+
+- `--pod-name TEXT`: Name of the pod to get logs from
+
+#### Optional Parameters
+
+- `--container TEXT`: Container name to get logs from
+- `--namespace TEXT`: Namespace of the pod (default: "default")
+
+### hyp get-operator-logs hyp-jumpstart-endpoint
+
+Get operator logs for JumpStart endpoints.
+
+#### Syntax
+
+```bash
+hyp get-operator-logs hyp-jumpstart-endpoint [OPTIONS]
+```
+
+#### Required Parameters
+
+- `--since-hours FLOAT`: Time frame to get logs for (in hours)
+
+### hyp get-operator-logs hyp-custom-endpoint
+
+Get operator logs for custom endpoints.
+
+#### Syntax
+
+```bash
+hyp get-operator-logs hyp-custom-endpoint [OPTIONS]
+```
+
+#### Required Parameters
+
+- `--since-hours FLOAT`: Time frame to get logs for (in hours)
+
+## Parameter Reference
+
+### Common Parameters Across Commands
+
+| Parameter | Type | Description | Default |
+|-----------|------|-------------|---------|
+| `--namespace` | TEXT | Kubernetes namespace | Current context |
+| `--help` | FLAG | Show command help | - |
diff --git a/doc/cli_reference.md b/doc/cli_reference.md
new file mode 100644
index 00000000..744ab4ed
--- /dev/null
+++ b/doc/cli_reference.md
@@ -0,0 +1,36 @@
+(cli_reference)=
+
+# CLI Reference
+
+```{toctree}
+:hidden:
+:maxdepth: 2
+
+cli_training
+cli_inference
+```
+
+Complete reference for the SageMaker HyperPod Command Line Interface.
+
+::::{container}
+::::{grid} 1 1 3 3
+:gutter: 3
+
+:::{grid-item-card} Training CLI
+:link: cli_training
+:link-type: ref
+:class-card: sd-border-secondary
+
+Training CLI commands, options and parameters.
+:::
+
+:::{grid-item-card} Inference CLI
+:link: cli_inference
+:link-type: ref
+:class-card: sd-border-secondary
+
+Inference CLI commands, options and parameters.
+:::
+
+::::
+::::
\ No newline at end of file
diff --git a/doc/cli_training.md b/doc/cli_training.md
new file mode 100644
index 00000000..1d4520b7
--- /dev/null
+++ b/doc/cli_training.md
@@ -0,0 +1,172 @@
+(cli_training)=
+
+
+# Training
+
+Complete reference for SageMaker HyperPod PyTorch training job parameters and configuration options.
+
+* [Create PyTorch Job](#hyp-create-hyp-pytorch-job)
+* [List Jobs](#hyp-list-hyp-pytorch-job)
+* [Describe Job](#hyp-describe-hyp-pytorch-job)
+* [Delete Job](#hyp-delete-hyp-pytorch-job)
+* [List Pods](#hyp-list-pods-hyp-pytorch-job)
+* [Get Logs](#hyp-get-logs-hyp-pytorch-job)
+
+
+## hyp create hyp-pytorch-job
+
+Create distributed PyTorch training jobs on SageMaker HyperPod clusters.
+
+### Syntax
+
+```bash
+hyp create hyp-pytorch-job [OPTIONS]
+```
+
+### Required Parameters
+
+- `--job-name TEXT`: Unique name for the training job (1-63 characters, alphanumeric with hyphens)
+- `--image TEXT`: Docker image URI containing your training code
+
+### Optional Parameters
+
+- `--namespace TEXT`: Kubernetes namespace
+- `--command ARRAY`: Command to run in the container (array of strings)
+- `--args ARRAY`: Arguments for the entry script (array of strings)
+- `--environment OBJECT`: Environment variables as key-value pairs
+- `--pull-policy TEXT`: Image pull policy (Always, Never, IfNotPresent)
+- `--instance-type TEXT`: Instance type for training
+- `--node-count INTEGER`: Number of nodes (minimum: 1)
+- `--tasks-per-node INTEGER`: Number of tasks per node (minimum: 1)
+- `--label-selector OBJECT`: Node label selector as key-value pairs
+- `--deep-health-check-passed-nodes-only BOOLEAN`: Schedule pods only on nodes that passed deep health check (default: false)
+- `--scheduler-type TEXT`: Scheduler type
+- `--queue-name TEXT`: Queue name for job scheduling (1-63 characters, alphanumeric with hyphens)
+- `--priority TEXT`: Priority class for job scheduling
+- `--max-retry INTEGER`: Maximum number of job retries (minimum: 0)
+- `--volume ARRAY`: List of volume configurations (Refer [Volume Configuration](#volume-configuration) for detailed parameter info)
+- `--service-account-name TEXT`: Service account name
+
+### Volume Configuration
+
+The `--volume` parameter supports mounting different types of storage to your training containers.
+
+### Volume Syntax
+
+```bash
+--volume name=<volume-name>,type=<volume-type>,mount_path=<mount-path>[,additional_options]
+```
+
+### Volume Types
+
+**hostPath Volume**
+```bash
+--volume name=model-data,type=hostPath,mount_path=/data,path=/host/data
+```
+
+**Persistent Volume Claim (PVC)**
+```bash
+--volume name=training-output,type=pvc,mount_path=/output,claim_name=training-pvc,read_only=false
+```
+
+### Volume Parameters
+
+| Parameter | Type | Required | Description |
+|-----------|------|----------|-------------|
+| `name` | TEXT | Yes | Volume name |
+| `type` | TEXT | Yes | Volume type (`hostPath` or `pvc`) |
+| `mount_path` | TEXT | Yes | Mount path in container |
+| `path` | TEXT | For hostPath | Host path for hostPath volumes |
+| `claim_name` | TEXT | For pvc | PVC claim name for pvc volumes |
+| `read_only` | BOOLEAN | No | Read-only flag for pvc volumes |
+
+## Training Job Management Commands
+
+Commands for managing PyTorch training jobs.
+
+### hyp list hyp-pytorch-job
+
+List all HyperPod PyTorch jobs in a namespace.
+
+#### Syntax
+
+```bash
+hyp list hyp-pytorch-job [OPTIONS]
+```
+
+#### Optional Parameters
+
+- `--namespace, -n TEXT`: Namespace to list jobs from (default: "default")
+
+### hyp describe hyp-pytorch-job
+
+Describe a specific HyperPod PyTorch job.
+
+#### Syntax
+
+```bash
+hyp describe hyp-pytorch-job [OPTIONS]
+```
+
+#### Required Parameters
+
+- `--job-name TEXT`: Name of the job to describe
+
+#### Optional Parameters
+
+- `--namespace, -n TEXT`: Namespace of the job (default: "default")
+
+### hyp delete hyp-pytorch-job
+
+Delete a HyperPod PyTorch job.
+
+#### Syntax
+
+```bash
+hyp delete hyp-pytorch-job [OPTIONS]
+```
+
+#### Required Parameters
+
+- `--job-name TEXT`: Name of the job to delete
+
+#### Optional Parameters
+
+- `--namespace, -n TEXT`: Namespace of the job (default: "default")
+
+### hyp list-pods hyp-pytorch-job
+
+List all pods associated with a PyTorch job.
+
+#### Syntax
+
+```bash
+hyp list-pods hyp-pytorch-job [OPTIONS]
+```
+
+#### Required Parameters
+
+- `--job-name TEXT`: Name of the job to list pods for
+
+#### Optional Parameters
+
+- `--namespace, -n TEXT`: Namespace of the job (default: "default")
+
+### hyp get-logs hyp-pytorch-job
+
+Get logs from a specific pod in a PyTorch job.
+
+#### Syntax
+
+```bash
+hyp get-logs hyp-pytorch-job [OPTIONS]
+```
+
+#### Required Parameters
+
+- `--job-name TEXT`: Name of the job
+- `--pod-name TEXT`: Name of the pod to get logs from
+
+#### Optional Parameters
+
+- `--namespace, -n TEXT`: Namespace of the job (default: "default")
diff --git a/doc/conf.py b/doc/conf.py
index 68bf9c75..cf944cf8 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -1,48 +1,59 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
"""Sphinx configuration."""
import datetime
import os
import shutil
+import sys
+import re
+import json
+from pathlib import Path
+from typing import Dict, List, Any, Optional
-def run_apidoc(app):
- """Generate doc stubs using sphinx-apidoc."""
- module_dir = os.path.join(app.srcdir, "../src/")
- output_dir = os.path.join(app.srcdir, "_apidoc")
- excludes = []
- # Ensure that any stale apidoc files are cleaned up first.
- if os.path.exists(output_dir):
- shutil.rmtree(output_dir)
+def setup(app):
+ """Register our sphinx hooks."""
- cmd = [
- "--separate",
- "--module-first",
- "--doc-project=API Reference",
- "-o",
- output_dir,
- module_dir,
- ]
- cmd.extend(excludes)
+# Get version from setup.py
+def get_version():
try:
- from sphinx.ext import apidoc # Sphinx >= 1.7
-
- apidoc.main(cmd)
- except ImportError:
- from sphinx import apidoc # Sphinx < 1.7
-
- cmd.insert(0, apidoc.__file__)
- apidoc.main(cmd)
-
-
-def setup(app):
- """Register our sphinx-apidoc hook."""
- app.connect("builder-inited", run_apidoc)
+ # Find the project root directory (where setup.py is located)
+ project_root = Path(__file__).parent.parent
+ setup_py_path = project_root / "setup.py"
+
+ # Read setup.py content
+ with open(setup_py_path, "r") as f:
+ setup_py_content = f.read()
+
+ # Extract version using regex
+ version_match = re.search(r'version\s*=\s*["\']([^"\']+)["\']', setup_py_content)
+ if version_match:
+ return version_match.group(1)
+ else:
+ print("Warning: Could not find version in setup.py")
+ return "unknown"
+ except Exception as e:
+ print(f"Warning: Could not extract version from setup.py: {e}")
+ return "unknown"
# Sphinx configuration below.
project = "SageMaker HyperPod CLI"
+version = get_version()
+release = version
# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {"python": ("http://docs.python.org/", None)}
@@ -53,16 +64,93 @@ def setup(app):
"sphinx.ext.napoleon",
"sphinx.ext.todo",
"sphinx.ext.viewcode",
+ "nbsphinx",
+ "myst_nb",
+ "sphinx_design",
+ "sphinx_tabs.tabs",
+ "sphinx_copybutton",
+ "sphinx.ext.autosummary",
+ "sphinx.ext.autosectionlabel",
]
-source_suffix = ".rst"
-master_doc = "index"
-autoclass_content = "class"
+autodoc_mock_imports = ["pyspark", "feature_store_pyspark", "py4j"]
+
+source_suffix = {
+ '.rst': 'restructuredtext',
+ '.ipynb': 'myst-nb',
+ '.md': 'myst-nb',
+}
+
+autoclass_content = "both"
+autodoc_default_flags = ["show-inheritance", "members", "undoc-members"]
autodoc_member_order = "bysource"
default_role = "py:obj"
-html_theme = "haiku"
-htmlhelp_basename = "{}doc".format(project)
+html_theme = "sphinx_book_theme"
+html_theme_options = {
+ "logo": {
+        "text": "SageMaker HyperPod CLI and SDK",
+ "image_light": "_static/image.png",
+ "image_dark": "_static/image.png",
+ },
+ "repository_url": "https://github.com/aws/sagemaker-hyperpod-cli",
+ "use_repository_button": True,
+ "use_issues_button": True,
+ "use_edit_page_button": True,
+ "path_to_docs": "doc",
+ "show_navbar_depth": 2,
+ "use_fullscreen_button": False,
+ "use_download_button": False,
+ "home_page_in_toc": True,
+ # Configuration to disable right-side table of contents
+ "secondary_sidebar_items": [], # Remove all content from right sidebar
+ "show_toc_level": 0, # Disable automatic TOC generation
+}
+
+author = "Amazon Web Services"
+copyright = f"{datetime.datetime.now().year}, Amazon Web Services"
+htmlhelp_basename = "{}doc".format(project)
+html_static_path = ["_static"]
+html_css_files = ["custom.css",
+ "search_accessories.css",
+ ]
napoleon_use_rtype = False
+
+# nbsphinx configuration
+nbsphinx_allow_errors = True
+nbsphinx_kernel_name = 'python3'
+
+# MyST-NB configuration
+myst_enable_extensions = [
+ "amsmath",
+ "colon_fence",
+ "deflist",
+ "dollarmath",
+ "html_image",
+ "html_admonition",
+ # "linkify", # Commented out until linkify-it-py is installed
+ "replacements",
+ "smartquotes",
+ "substitution",
+ "tasklist",
+]
+myst_heading_anchors = 3
+nb_execution_mode = "off"
+
+# Make version available to MyST templates
+myst_substitutions = {
+ "version": version,
+}
+
+# Automatically extract typehints when specified and place them in
+# descriptions of the relevant function/method.
+autodoc_typehints = "description"
+
+
+# autosummary
+autosummary_generate = True
+
+# autosectionlabel
+autosectionlabel_prefix_document = True
\ No newline at end of file
diff --git a/doc/examples.md b/doc/examples.md
new file mode 100644
index 00000000..afda4a66
--- /dev/null
+++ b/doc/examples.md
@@ -0,0 +1,50 @@
+(examples)=
+
+# Example Notebooks
+
+## Training Example Notebooks
+
+For detailed examples of training with HyperPod, see:
+
+::::{grid} 1 2 2 2
+:gutter: 3
+
+:::{grid-item-card} CLI Training Example
+:link: https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/training/CLI/training-e2e-cli.ipynb
+:class-card: sd-border-primary
+
+**Training Examples** Refer to the Training Example.
+:::
+
+:::{grid-item-card} SDK Training Example
+:link: https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/training/SDK/training_sdk_example.ipynb
+:class-card: sd-border-primary
+
+**Training Examples** Refer to the Training SDK Example.
+:::
+
+::::
+
+
+## Inference Example Notebooks
+
+For detailed examples of inference with HyperPod, see:
+
+::::{grid} 1 2 2 2
+:gutter: 3
+
+:::{grid-item-card} CLI Inference Examples
+- CLI Inference JumpStart Model Example
+- CLI Inference FSX Model Example
+- CLI Inference S3 Model Example
+
+:::
+
+:::{grid-item-card} SDK Inference Example
+- SDK Inference JumpStart Model Example
+- SDK Inference FSX Model Example
+- SDK Inference S3 Model Example
+
+:::
+
+::::
diff --git a/doc/getting_started.md b/doc/getting_started.md
new file mode 100644
index 00000000..a7b34103
--- /dev/null
+++ b/doc/getting_started.md
@@ -0,0 +1,91 @@
+(getting_started)=
+
+# Getting Started
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+Training <training>
+Inference <inference>
+
+```
+
+This guide will help you get started with the SageMaker HyperPod CLI and SDK to perform basic operations.
+
+## List Available Clusters
+
+List all available SageMaker HyperPod clusters in your account:
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+hyp list-cluster [--region <region>]
+```
+````
+
+````{tab-item} SDK
+```python
+from sagemaker.hyperpod import list_clusters
+
+list_clusters(region='aws-region')
+
+```
+````
+`````
+
+## Connect to a Cluster
+
+Configure your local kubectl environment to interact with a specific SageMaker HyperPod cluster and namespace:
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+hyp set-cluster-context --cluster-name <cluster-name>
+```
+````
+
+````{tab-item} SDK
+```python
+from sagemaker.hyperpod import set_cluster_context
+
+set_cluster_context('<cluster-name>')
+
+```
+````
+`````
+
+## Get Current Cluster Context
+
+View information about the currently configured cluster context:
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+hyp get-cluster-context
+```
+````
+
+````{tab-item} SDK
+```python
+from sagemaker.hyperpod import get_cluster_context
+
+get_cluster_context()
+```
+````
+`````
+
+
+## Next Steps
+
+After setting up your environment and connecting to a cluster, you can:
+
+- Create and manage PyTorch training jobs
+- Deploy and manage inference endpoints
+- Monitor cluster resources and job performance
+
+For more detailed information on specific commands, use the `--help` flag:
+
+```bash
+hyp --help
+```
\ No newline at end of file
diff --git a/doc/index.md b/doc/index.md
new file mode 100644
index 00000000..8551d445
--- /dev/null
+++ b/doc/index.md
@@ -0,0 +1,135 @@
+---
+keywords:
+ - distributed
+ - kubernetes
+ - pytorch
+ - monitoring
+ - jumpstart
+---
+
+(hpcli_docs_mainpage)=
+
+# Overview
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+Installation
+Getting Started
+CLI Reference
+SDK reference
+Advanced Resources
+```
+
+Transform your AI/ML development process with Amazon SageMaker HyperPod CLI and SDK. These tools handle infrastructure management complexities, allowing you to focus on model development and innovation. Whether you're scaling PyTorch training jobs across thousands of GPUs, deploying production-grade inference endpoints, or managing multiple clusters efficiently, the intuitive command-line interface and programmatic control enable you to:
+- Accelerate development cycles and reduce operational overhead
+- Automate ML workflows while maintaining operational visibility
+- Optimize computing resources across your AI/ML projects
+
+
+```{note}
+Version Info - you're viewing the latest documentation for the SageMaker HyperPod CLI and SDK v3.0.0.
+```
+
+
+```{admonition} What's New
+:class: important
+
+🚀 We are excited to announce general availability of Amazon SageMaker HyperPod CLI and SDK!
+
+
+**Major Updates**:
+- **Distributed Training**: Scale PyTorch jobs across multiple nodes and GPUs with simplified management and automatic fault tolerance.
+- **Model Inference**: Deploy pre-trained models from SageMaker JumpStart and host custom auto-scaling inference endpoints.
+- **Observability**: Connect to and manage multiple HyperPod clusters with enhanced monitoring capabilities.
+- **Usability Improvements**: Intuitive CLI for quick experimentation and cluster management, granular SDK control over workload configurations, and easy access to system logs and observability dashboards for efficient debugging.
+
+```
+
+## Quick Start
+
+
+::::{grid} 1 2 2 2
+:gutter: 3
+
+:::{grid-item-card} Installation
+:link: installation
+:link-type: ref
+:class-card: sd-border-primary
+
+**New to HyperPod?** Install the CLI/SDK in minutes.
+:::
+
+:::{grid-item-card} Getting Started
+:link: getting_started
+:link-type: ref
+:class-card: sd-border-secondary
+
+**Ready to explore?** Connect to your cluster before running ML workflows.
+:::
+
+:::{grid-item-card} Training
+:link: training
+:link-type: ref
+:class-card: sd-border-secondary
+
+**Scale Your ML Models!** Get started with training
+:::
+
+:::{grid-item-card} Inference
+:link: inference
+:link-type: ref
+:class-card: sd-border-secondary
+
+**Deploy Your ML Model!** Get started with inference
+:::
+
+::::
+
+## Advanced Resources
+
+::::{grid} 1 2 2 2
+:gutter: 3
+
+:::{grid-item-card} API reference
+:link: api/api_index.html
+:class-card: sd-border-primary
+
+**Explore APIs** - Checkout API Documentation
+:::
+
+:::{grid-item-card} Github
+:link: examples
+:link-type: ref
+:class-card: sd-border-secondary
+
+**Example Notebooks** - Ready-to-use implementation guides
+:::
+
+:::{grid-item-card} AWS SageMaker HyperPod Docs
+:link: https://docs.aws.amazon.com/sagemaker/latest/dg/hyperpod.html
+:link-type: url
+:class-card: sd-border-secondary
+
+**HyperPod Documentation** - Know more about HyperPod
+:::
+
+:::{grid-item-card} HyperPod Developer Guide
+:link: https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US
+:link-type: url
+:class-card: sd-border-secondary
+
+**Developer Guide** - Refer to this practical development guide
+:::
+
+:::{grid-item-card} SageMaker HyperPod Workshop
+:link: https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US
+:link-type: url
+:class-card: sd-border-secondary
+
+**Practical Guide** - Refer to the workshop for detailed follow-through steps
+:::
+
+
+::::
diff --git a/doc/index.rst b/doc/index.rst
deleted file mode 100644
index 0f5525de..00000000
--- a/doc/index.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-HyperpodCLI
-=======================
-
-Please replace this text with a short description of your package.
-
-.. toctree::
-
- _apidoc/modules
-
-
-Indices and tables
-__________________
-
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
diff --git a/doc/inference.md b/doc/inference.md
new file mode 100644
index 00000000..e187d48c
--- /dev/null
+++ b/doc/inference.md
@@ -0,0 +1,324 @@
+(inference)=
+
+# Inference with SageMaker HyperPod
+
+SageMaker HyperPod provides powerful capabilities for deploying and managing inference endpoints on EKS-hosted clusters. This guide covers how to create, invoke, and manage inference endpoints using both the HyperPod CLI and SDK.
+
+## Overview
+
+SageMaker HyperPod inference endpoints allow you to:
+
+- Deploy pre-trained JumpStart models
+- Deploy custom models with your own inference code
+- Configure resource requirements for inference
+- Manage endpoint lifecycle
+- Invoke endpoints for real-time predictions
+- Monitor endpoint performance
+
+## Creating Inference Endpoints
+
+You can create inference endpoints using either JumpStart models or custom models:
+
+### JumpStart Model Endpoints
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+hyp create hyp-jumpstart-endpoint \
+ --model-id jumpstart-model-id \
+ --instance-type ml.g5.8xlarge \
+ --endpoint-name endpoint-jumpstart
+```
+````
+
+````{tab-item} SDK
+```python
+from sagemaker.hyperpod.inference.config.hp_jumpstart_endpoint_config import Model, Server, SageMakerEndpoint, TlsConfig
+from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint
+
+model = Model(
+ model_id="deepseek-llm-r1-distill-qwen-1-5b",
+ model_version="2.0.4"
+)
+
+server = Server(
+ instance_type="ml.g5.8xlarge"
+)
+
+endpoint_name = SageMakerEndpoint(name="endpoint-jumpstart")
+
+tls_config = TlsConfig(tls_certificate_output_s3_uri="s3://sample-bucket")
+
+js_endpoint = HPJumpStartEndpoint(
+ model=model,
+ server=server,
+ sage_maker_endpoint=endpoint_name,
+ tls_config=tls_config
+)
+
+js_endpoint.create()
+```
+````
+`````
+
+### Custom Model Endpoints
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+hyp create hyp-custom-endpoint \
+ --version 1.0 \
+ --endpoint-name endpoint-s3 \
+    --model-name <model-name> \
+    --model-source-type s3 \
+    --instance-type <instance-type> \
+    --image-uri <image-uri> \
+ --container-port 8080 \
+ --model-volume-mount-name model-weights
+```
+````
+
+````{tab-item} SDK
+```python
+from sagemaker.hyperpod.inference.config.hp_custom_endpoint_config import Model, Server, SageMakerEndpoint, TlsConfig, EnvironmentVariables
+from sagemaker.hyperpod.inference.hp_custom_endpoint import HPEndpoint
+
+model = Model(
+ model_source_type="s3",
+ model_location="test-pytorch-job/model.tar.gz",
+ s3_bucket_name="my-bucket",
+ s3_region="us-east-2",
+ prefetch_enabled=True
+)
+
+server = Server(
+ instance_type="ml.g5.8xlarge",
+ image_uri="763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0",
+ container_port=8080,
+ model_volume_mount_name="model-weights"
+)
+
+resources = {
+ "requests": {"cpu": "30000m", "nvidia.com/gpu": 1, "memory": "100Gi"},
+ "limits": {"nvidia.com/gpu": 1}
+}
+
+env = EnvironmentVariables(
+ HF_MODEL_ID="/opt/ml/model",
+ SAGEMAKER_PROGRAM="inference.py",
+ SAGEMAKER_SUBMIT_DIRECTORY="/opt/ml/model/code",
+ MODEL_CACHE_ROOT="/opt/ml/model",
+ SAGEMAKER_ENV="1"
+)
+
+endpoint_name = SageMakerEndpoint(name="endpoint-custom-pytorch")
+
+tls_config = TlsConfig(tls_certificate_output_s3_uri="s3://sample-bucket")
+
+custom_endpoint = HPEndpoint(
+ model=model,
+ server=server,
+ resources=resources,
+ environment=env,
+ sage_maker_endpoint=endpoint_name,
+ tls_config=tls_config,
+)
+
+custom_endpoint.create()
+```
+````
+`````
+
+### Key Parameters
+
+When creating an inference endpoint, you'll need to specify:
+
+- **endpoint-name**: Unique identifier for your endpoint
+- **instance-type**: The EC2 instance type to use
+- **model-id** (JumpStart): ID of the pre-trained JumpStart model
+- **image-uri** (Custom): Docker image containing your inference code
+- **model-name** (Custom): Name of model to create on SageMaker
+- **model-source-type** (Custom): Source type: fsx or s3
+- **model-volume-mount-name** (Custom): Name of the model volume mount
+- **container-port** (Custom): Port on which the model server listens
+
+## Managing Inference Endpoints
+
+### List Endpoints
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+# List JumpStart endpoints
+hyp list hyp-jumpstart-endpoint
+
+# List custom endpoints
+hyp list hyp-custom-endpoint
+```
+````
+
+````{tab-item} SDK
+```python
+from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint
+from sagemaker.hyperpod.inference.hp_custom_endpoint import HPEndpoint
+
+# List JumpStart endpoints
+jumpstart_endpoints = HPJumpStartEndpoint.list()
+print(jumpstart_endpoints)
+
+# List custom endpoints
+custom_endpoints = HPEndpoint.list()
+print(custom_endpoints)
+```
+````
+`````
+
+### Describe an Endpoint
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+# Describe JumpStart endpoint
+hyp describe hyp-jumpstart-endpoint --name <endpoint-name>
+
+# Describe custom endpoint
+hyp describe hyp-custom-endpoint --name <endpoint-name>
+```
+````
+
+````{tab-item} SDK
+```python
+from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint
+from sagemaker.hyperpod.inference.hp_custom_endpoint import HPEndpoint
+
+# Get JumpStart endpoint details
+jumpstart_endpoint = HPJumpStartEndpoint.get(name="js-endpoint-name", namespace="test")
+print(jumpstart_endpoint)
+
+# Get custom endpoint details
+custom_endpoint = HPEndpoint.get(endpoint_name="endpoint-custom")
+print(custom_endpoint)
+
+```
+````
+`````
+
+### Invoke an Endpoint
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+# Invoke JumpStart endpoint
+hyp invoke hyp-jumpstart-endpoint \
+    --endpoint-name <endpoint-name> \
+    --body '{"inputs":"What is the capital of USA?"}'
+
+# Invoke custom endpoint
+hyp invoke hyp-custom-endpoint \
+    --endpoint-name <endpoint-name> \
+    --body '{"inputs": "What is machine learning?"}'
+```
+````
+
+````{tab-item} SDK
+```python
+data = '{"inputs":"What is the capital of USA?"}'
+response = endpoint.invoke(body=data).body.read()
+print(response)
+```
+````
+`````
+
+### List Pods
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+# JumpStart endpoint
+hyp list-pods hyp-jumpstart-endpoint
+
+# Custom endpoint
+hyp list-pods hyp-custom-endpoint
+```
+````
+`````
+
+### Get Logs
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+# JumpStart endpoint
+hyp get-logs hyp-jumpstart-endpoint --pod-name <pod-name>
+
+# Custom endpoint
+hyp get-logs hyp-custom-endpoint --pod-name <pod-name>
+```
+````
+`````
+
+### Get Operator Logs
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+# JumpStart endpoint
+hyp get-operator-logs hyp-jumpstart-endpoint --since-hours 0.5
+
+# Custom endpoint
+hyp get-operator-logs hyp-custom-endpoint --since-hours 0.5
+```
+````
+
+````{tab-item} SDK
+```python
+print(endpoint.get_operator_logs(since_hours=0.1))
+```
+````
+`````
+
+### Delete an Endpoint
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+# Delete JumpStart endpoint
+hyp delete hyp-jumpstart-endpoint --name <endpoint-name>
+
+# Delete custom endpoint
+hyp delete hyp-custom-endpoint --name <endpoint-name>
+```
+````
+
+````{tab-item} SDK
+```python
+from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint
+from sagemaker.hyperpod.inference.hp_custom_endpoint import HPEndpoint
+
+# Delete JumpStart endpoint
+jumpstart_endpoint = HPJumpStartEndpoint.get(endpoint_name="endpoint-jumpstart")
+jumpstart_endpoint.delete()
+
+# Delete custom endpoint
+custom_endpoint = HPEndpoint.get(endpoint_name="endpoint-custom")
+custom_endpoint.delete()
+```
+````
+`````
+
+## Inference Example Notebooks
+
+For detailed examples of inference with HyperPod, explore these interactive Jupyter notebooks:
+
+CLI Examples:
+- CLI Inference FSX Model Example
+- CLI Inference JumpStart Model Example
+- CLI Inference S3 Model Example
+
+SDK Examples:
+- SDK Inference FSX Model Example
+- SDK Inference JumpStart Model Example
+- SDK Inference S3 Model Example
+
+These Jupyter notebooks demonstrate comprehensive workflows for deploying and managing inference endpoints using different model storage options and both CLI and SDK approaches. You can run these notebooks directly
+in your local environment or SageMaker Studio.
diff --git a/doc/installation.md b/doc/installation.md
new file mode 100644
index 00000000..77992f9a
--- /dev/null
+++ b/doc/installation.md
@@ -0,0 +1,62 @@
+(installation)=
+# Installation
+This guide provides installation instructions for the SageMaker HyperPod CLI and SDK.
+
+## System Requirements
+
+### Supported Platforms
+- Linux
+- macOS
+
+```{note}
+ Windows is not supported at this time.
+```
+
+### Supported ML Frameworks
+- PyTorch (version ≥ 1.10)
+
+### Supported Python Versions
+- 3.9 and above
+
+## Prerequisites
+
+### For Training
+SageMaker HyperPod CLI currently supports `HyperPodPytorchJob` training workloads.
+To run these jobs, install the **SageMaker Training Operator**.
+
+[Install the SageMaker Training Operator](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-eks-operator-install.html)
+
+### For Inference
+The CLI supports creating inference endpoints using JumpStart models or custom models.
+To enable this, install the **SageMaker Inference Operator**.
+
+[Install the SageMaker Inference Operator](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-model-deployment-setup.html)
+
+## Installation Options
+
+### Install from PyPI
+
+It's recommended to install the SageMaker HyperPod CLI and SDK in a Python virtual environment to avoid conflicts with other packages:
+```bash
+# Create a virtual environment
+python -m venv {venv-name}
+
+# Activate the virtual environment
+source {venv-name}/bin/activate
+```
+```{note}
+Remember to activate your virtual environment (source {venv-name}/bin/activate) each time you want to use the HyperPod CLI and SDK if you chose the virtual environment installation method.
+```
+You can install the SageMaker HyperPod CLI and SDK directly using `pip`:
+
+```bash
+# Install from PyPI
+pip install sagemaker-hyperpod
+```
+
+To verify that the installation was successful, run:
+
+```bash
+# Verify CLI installation
+hyp --help
+```
diff --git a/doc/requirements.txt b/doc/requirements.txt
new file mode 100644
index 00000000..a9f4a087
--- /dev/null
+++ b/doc/requirements.txt
@@ -0,0 +1,10 @@
+sphinx>=4.0.0,<8.0.0
+nbsphinx>=0.8.8
+myst-nb>=0.17.1
+ipykernel>=6.0.0
+jupyter>=1.0.0
+sphinx-book-theme>=1.0.0
+linkify-it-py>=2.0.0
+sphinx-design>=0.5.0
+sphinx-tabs>=3.4.1
+sphinx-copybutton
diff --git a/doc/training.md b/doc/training.md
new file mode 100644
index 00000000..181f826a
--- /dev/null
+++ b/doc/training.md
@@ -0,0 +1,199 @@
+---
+keywords:
+ - distributed
+ - kubernetes
+ - pytorch
+ - containerized
+ - orchestration
+---
+
+(training)=
+
+# Training with SageMaker HyperPod
+
+SageMaker HyperPod provides powerful capabilities for running distributed training workloads on EKS-orchestrated clusters. This guide covers how to create and manage training jobs using both the HyperPod CLI and SDK.
+
+## Overview
+
+SageMaker HyperPod training jobs allow you to:
+
+- Run distributed PyTorch training workloads
+- Specify custom Docker images with your training code
+- Configure resource requirements (instance types, GPUs)
+- Set up node selection with label selectors
+- Manage job scheduling and priorities
+- Mount volumes and persistent volume claims
+
+## Creating Training Jobs
+
+You can create training jobs using either the CLI or SDK approach:
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+hyp create hyp-pytorch-job \
+ --job-name test-pytorch-job \
+    --image pytorch/pytorch:latest
+```
+````
+````{tab-item} SDK
+```python
+from sagemaker.hyperpod.training import (
+ HyperPodPytorchJob,
+ Containers,
+ ReplicaSpec,
+ Resources,
+ RunPolicy,
+ Spec,
+ Template,
+)
+from sagemaker.hyperpod.common.config import Metadata
+
+
+nproc_per_node="1"
+replica_specs=[
+ ReplicaSpec(
+ name="pod",
+ template=Template(
+ spec=Spec(
+ containers=[
+ Containers(
+ name="container-name",
+ image="448049793756.dkr.ecr.us-west-2.amazonaws.com/ptjob:mnist",
+ image_pull_policy="Always",
+ resources=Resources(
+ requests={"nvidia.com/gpu": "0"},
+ limits={"nvidia.com/gpu": "0"},
+ ),
+ # command=[]
+ )
+ ]
+ )
+ ),
+ )
+]
+run_policy=RunPolicy(clean_pod_policy="None")
+
+pytorch_job = HyperPodPytorchJob(
+ metadata=Metadata(name="demo"),
+ nproc_per_node="1",
+ replica_specs=replica_specs,
+ run_policy=run_policy,
+)
+
+pytorch_job.create()
+```
+````
+`````
+
+### Key Parameters
+
+When creating a training job, you'll need to specify:
+
+- **job-name**: Unique identifier for your training job
+- **image**: Docker image containing your training environment
+
+
+## Managing Training Jobs
+
+### List Training Jobs
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+hyp list hyp-pytorch-job
+```
+````
+````{tab-item} SDK
+```python
+from sagemaker.hyperpod.training import HyperPodPytorchJob
+import yaml
+
+# List all PyTorch jobs
+jobs = HyperPodPytorchJob.list()
+print(yaml.dump(jobs))
+```
+````
+`````
+
+### Describe a Training Job
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+hyp describe hyp-pytorch-job --job-name <job-name>
+```
+````
+````{tab-item} SDK
+```python
+from sagemaker.hyperpod.training import HyperPodPytorchJob
+
+# Get an existing job
+job = HyperPodPytorchJob.get(name="my-pytorch-job", namespace="my-namespace")
+
+print(job)
+```
+````
+`````
+
+### List Pods for a Training Job
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+hyp list-pods hyp-pytorch-job --job-name <job-name>
+```
+````
+
+````{tab-item} SDK
+```python
+print(pytorch_job.list_pods())
+```
+````
+`````
+
+### Get Logs from a Pod
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+hyp get-logs hyp-pytorch-job --pod-name test-pytorch-job-cli-pod-0 --job-name test-pytorch-job-cli
+```
+````
+
+````{tab-item} SDK
+```python
+print(pytorch_job.get_logs_from_pod("pod-name"))
+```
+````
+`````
+
+### Delete a Training Job
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+hyp delete hyp-pytorch-job --job-name <job-name>
+```
+````
+````{tab-item} SDK
+```python
+from sagemaker.hyperpod.training import HyperPodPytorchJob
+
+# Get an existing job
+job = HyperPodPytorchJob.get(name="my-pytorch-job", namespace="my-namespace")
+
+# Delete the job
+job.delete()
+```
+````
+`````
+
+## Training Example Notebooks
+
+For detailed examples of training with HyperPod, see:
+
+- CLI Training Example
+- SDK Training Example
+
+These examples demonstrate end-to-end workflows for creating and managing training jobs using both the CLI and SDK approaches.