Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions .readthedocs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
version: 2

build:
os: ubuntu-22.04
tools:
python: "3.9"

python:
install:
- method: pip
path: .
- requirements: doc/requirements.txt

sphinx:
configuration: doc/conf.py
fail_on_warning: false

formats:
- pdf
- epub
20 changes: 20 additions & 0 deletions doc/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS = -W
SPHINXBUILD = python3 -msphinx
SPHINXPROJ = sagemaker
SOURCEDIR = .
BUILDDIR = _build

# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
82 changes: 79 additions & 3 deletions doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
import datetime
import os
import shutil

import sys
import re
from pathlib import Path

def run_apidoc(app):
"""Generate doc stubs using sphinx-apidoc."""
Expand Down Expand Up @@ -41,8 +43,33 @@ def setup(app):
app.connect("builder-inited", run_apidoc)


# Get version from setup.py
def get_version(setup_py_path=None):
    """Return the package version declared in ``setup.py``.

    The version is extracted with a regular expression rather than by
    importing or executing ``setup.py``, so no packaging code runs during
    the documentation build.

    Args:
        setup_py_path: Optional path to the ``setup.py`` file to inspect.
            When omitted, ``setup.py`` is looked up in the parent of the
            directory that contains this file.

    Returns:
        The version string from the first ``version="..."`` assignment
        found, or ``"unknown"`` if the file cannot be read or contains no
        such assignment. A warning is printed in both fallback cases.
    """
    try:
        if setup_py_path is None:
            # Find the project root directory (where setup.py is located).
            setup_py_path = Path(__file__).parent.parent / "setup.py"

        # Read with an explicit encoding so the result does not depend on
        # the locale of the machine running the build.
        with open(setup_py_path, "r", encoding="utf-8") as f:
            setup_py_content = f.read()
    except Exception as e:
        # Deliberately broad: a missing or unreadable setup.py must never
        # abort the documentation build.
        print(f"Warning: Could not extract version from setup.py: {e}")
        return "unknown"

    # The leading \b keeps keys such as ``python_version=`` or
    # ``min_version=`` from being mistaken for the package version.
    version_match = re.search(
        r'\bversion\s*=\s*["\']([^"\']+)["\']', setup_py_content
    )
    if version_match:
        return version_match.group(1)

    print("Warning: Could not find version in setup.py")
    return "unknown"


# Sphinx configuration below.
project = "SageMaker HyperPod CLI"
version = get_version()
release = version

# Example configuration for intersphinx: refer to the Python standard library.
# Point at the HTTPS, versioned documentation root so the objects.inv
# inventory is fetched securely and without relying on redirects.
intersphinx_mapping = {"python": ("https://docs.python.org/3", None)}
Expand All @@ -53,16 +80,65 @@ def setup(app):
"sphinx.ext.napoleon",
"sphinx.ext.todo",
"sphinx.ext.viewcode",
"nbsphinx",
# Use either myst_parser or myst_nb, not both
# "myst_parser",
"myst_nb",
"sphinx_design",
]

source_suffix = ".rst"
# Mock modules that might not be available during documentation build
autodoc_mock_imports = [
'sagemaker.hyperpod.training.config.hyperpod_pytorch_job_config',
'hyperpod_pytorch_job_template.registry'
]

source_suffix = {
'.rst': 'restructuredtext',
'.ipynb': 'myst-nb',
'.md': 'myst-nb',
}
master_doc = "index"

autoclass_content = "class"
autodoc_member_order = "bysource"
default_role = "py:obj"

html_theme = "haiku"
html_theme = "sphinx_book_theme"
html_theme_options = {
"repository_url": "https://github.com/aws/sagemaker-hyperpod-cli",
"use_repository_button": True,
"use_issues_button": True,
"use_edit_page_button": True,
"path_to_docs": "doc",
"show_navbar_depth": 2,
}
htmlhelp_basename = "{}doc".format(project)

napoleon_use_rtype = False

# nbsphinx configuration
nbsphinx_allow_errors = True
nbsphinx_kernel_name = 'python3'

# MyST-NB configuration
myst_enable_extensions = [
"amsmath",
"colon_fence",
"deflist",
"dollarmath",
"html_image",
"html_admonition",
# "linkify", # Commented out until linkify-it-py is installed
"replacements",
"smartquotes",
"substitution",
"tasklist",
]
myst_heading_anchors = 3
nb_execution_mode = "off"

# Make version available to MyST templates
myst_substitutions = {
"version": version,
}
75 changes: 75 additions & 0 deletions doc/getting_started.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"(getting_started)=\n",
"# Getting Started with SageMaker HyperPod CLI\n",
"\n",
"This notebook provides a quick introduction to using the SageMaker HyperPod CLI."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Installation\n",
"\n",
"You can install the SageMaker HyperPod CLI using pip:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pip install sagemaker-hyperpod-cli"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Basic Usage\n",
"\n",
"Here's a simple example of how to use the SageMaker HyperPod CLI:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Import the necessary modules\n",
"import sagemaker_hyperpod_cli\n",
"\n",
"# Example code here\n",
"print(\"Hello from SageMaker HyperPod CLI!\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
58 changes: 58 additions & 0 deletions doc/index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
(hpcli_docs_mainpage)=

# SageMaker HyperPod CLI and SDK Documentation

**Version**: {{ version }}

```{toctree}
:hidden:
:maxdepth: 1

Getting Started <getting_started>
API reference <_apidoc/modules>
```

SageMaker HyperPod CLI and SDK provide a seamless way to manage distributed training and inference workloads on EKS-hosted SageMaker HyperPod clusters—without needing Kubernetes expertise. Use the powerful CLI to launch and monitor training jobs and endpoints, or leverage the Python SDK to do the same programmatically with minimal code, including support for JumpStart models, custom endpoints, and built-in monitoring.

:::::{container}
::::{grid}
:gutter: 3

:::{grid-item-card} Installation
:link: getting_started
:link-type: ref

Set up the CLI and SDK
:::

:::{grid-item-card} Quickstart
:link: getting_started
:link-type: ref

Beginner's guide to using the CLI and SDK
:::

:::{grid-item-card} Training
:link: getting_started
:link-type: ref

Detailed guide on creating PyTorch training jobs
:::

:::{grid-item-card} Inference
:link: getting_started
:link-type: ref

Detailed guide on creating, invoking, and monitoring endpoints
:::

:::{grid-item-card} Contributor's Guide
:link: getting_started
:link-type: ref

Improve the SageMaker HyperPod CLI and SDK
:::

::::
:::::

16 changes: 0 additions & 16 deletions doc/index.rst

This file was deleted.

8 changes: 8 additions & 0 deletions doc/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
sphinx>=4.0.0,<8.0.0
nbsphinx>=0.8.8
myst-nb>=0.17.1
ipykernel>=6.0.0
jupyter>=1.0.0
sphinx-book-theme>=1.0.0
linkify-it-py>=2.0.0
sphinx-design>=0.5.0
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
namespace: "aws-hyperpod"
hmaimage: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.552.0_1.0.161.0"
hmaimage: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0"
26 changes: 13 additions & 13 deletions helm_chart/readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,19 +171,19 @@ helm upgrade dependencies helm_chart/HyperPodHelmChart --namespace kube-system
- Training job auto resume is expected to work with Kubeflow training operator release v1.7.0, v1.8.0, v1.8.1 https://github.com/kubeflow/training-operator/releases
- If you intend to use the Health Monitoring Agent container image from another region, see the list below to find the relevant region's URI.
```
IAD 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
PDX 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
CMH 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
SFO 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
FRA 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
ARN 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
DUB 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
LHR 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
NRT 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
BOM 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
SIN 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
SYD 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
GRU 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
IAD 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
PDX 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
CMH 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
SFO 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
FRA 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
ARN 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
DUB 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
LHR 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
NRT 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
BOM 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
SIN 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
SYD 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
GRU 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
```

## 7. Troubleshooting
Expand Down
2 changes: 2 additions & 0 deletions src/sagemaker/hyperpod/cli/training_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,11 @@ def generate_click_command(
1) Injects click.options from the JSON Schema under `schema_pkg`
2) At runtime, pops `version`, builds the flat model from `registry`, calls .to_domain()
3) Finally invokes your handler as `func(version, domain_config)`

- `version_key`: if given, hard-codes the version (no --version flag injected)
- `schema_pkg`: the importable package root to read schema.json from
- `registry`: a dict mapping version → flat‐model class, e.g. hyperpod_pytorch_job_template.registry.SCHEMA_REGISTRY

"""
if registry is None:
raise ValueError("You must pass a registry mapping version→Model")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def wrapper(*args, **kwargs):
duration = round(perf_counter() - start, 2)
extra += f"&x-latency={duration}"
_send_telemetry_request(
Status.SUCCESS,
STATUS_TO_CODE[str(Status.SUCCESS)],
[FEATURE_TO_CODE[str(feature)]],
None,
None,
Expand All @@ -172,7 +172,7 @@ def wrapper(*args, **kwargs):
duration = round(perf_counter() - start, 2)
extra += f"&x-latency={duration}"
_send_telemetry_request(
Status.FAILURE,
STATUS_TO_CODE[str(Status.FAILURE)],
[FEATURE_TO_CODE[str(feature)]],
None,
str(e),
Expand Down
Loading